How to use `last()` when mutating by group with {dbplyr}?

Consider the following remote table:
library(dbplyr)
library(dplyr, warn.conflicts = FALSE)

remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id  = c(1, 3, 5, 7, 2, 4, 6)
)
I wish to group by grp, order by win, and take the last id.
This is fairly straightforward if I collect() first:
# intended output when collecting first
remote_data %>%
  collect() %>%
  arrange(grp, win) %>%
  group_by(grp) %>%
  mutate(last_id = last(id)) %>%
  ungroup()
#> # A tibble: 7 × 4
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 3
#> 5 2 B 1 3
#> 6 2 C 3 3
#> 7 3 C 2 2
However, I cannot directly convert this to {dbplyr} code simply by removing collect(). The generated SQL doesn't look bad, so what is happening here?
remote_data %>%
  arrange(grp, win) %>%
  group_by(grp) %>%
  mutate(last_id = last(id)) %>%
  ungroup() %>%
  print() %>%
  show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 4
#> 2 1 B 7 7
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 1
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT
#> *,
#> LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
dbplyr::window_order() allows us to override the ORDER BY clause created by group_by(). I tried window_order(win), but no cookie:
remote_data %>%
  arrange(grp, win) %>%
  group_by(grp) %>%
  window_order(win) %>%
  mutate(last_id = last(id)) %>%
  ungroup() %>%
  print() %>%
  show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 4
#> 2 1 B 7 7
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 1
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
For some reason window_order(grp) does trigger a window calculation, but not with the expected order:
remote_data %>%
  arrange(grp, win) %>%
  group_by(grp) %>%
  window_order(grp) %>%
  mutate(last_id = last(id)) %>%
  ungroup() %>%
  print() %>%
  show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 5
#> 6 2 C 3 5
#> 7 3 C 2 2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
What can I do to reproduce my initial output with only remote computations, preferably with {dbplyr} code?

It seems like you need to use window_frame():
library(dbplyr)
library(dplyr, warn.conflicts = FALSE)

remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id  = c(1, 3, 5, 7, 2, 4, 6)
)
remote_data %>%
  group_by(grp) %>%
  window_order(win) %>%
  window_frame() %>%
  mutate(last_id = last(id)) %>%
  ungroup() %>%
  print() %>%
  show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 3
#> 5 2 B 1 3
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT
#> *,
#> LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_id`
#> FROM `dbplyr_001`
Created on 2022-12-05 with reprex v2.0.2
This feels like a bug in dbplyr to me. Can you please open an issue in the dbplyr repo? Then I can fix this for the next dbplyr release.
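For what it's worth, the reason window_frame() helps here appears to be SQL's default window frame: when an ORDER BY is present, the frame runs from the start of the partition to the current row, so LAST_VALUE() simply returns the current row's id. Calling window_frame() with no arguments makes the frame unbounded on both sides; spelling that out explicitly (a small sketch, equivalent to the call above as far as I can tell):
remote_data %>%
  group_by(grp) %>%
  window_order(win) %>%
  window_frame(from = -Inf, to = Inf) %>% # ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
  mutate(last_id = last(id)) %>%
  ungroup()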

While last() seems to be broken, first() appears to work as expected, so you can make use of its order_by argument together with row_number() to get the last value. You also need to move arrange() further down the pipeline and use window_order() in its place:
library(dbplyr)
library(dplyr, warn.conflicts = FALSE)
remote_data %>%
  window_order(grp, win) %>%
  group_by(grp) %>%
  mutate(rn = row_number(),
         last_id = first(id, order_by = desc(rn))) %>%
  arrange(grp, win, rn) %>%
  ungroup() %>%
  select(-rn) %>%
  print() %>%
  show_query()
# Source: SQL [7 x 4]
# Database: sqlite 3.39.4 [:memory:]
# Ordered by: grp, win, rn
grp win id last_id
<dbl> <chr> <dbl> <dbl>
1 1 A 4 6
2 1 B 7 6
3 1 C 6 6
4 2 A 5 3
5 2 B 1 3
6 2 C 3 3
7 3 C 2 2
<SQL>
SELECT
`grp`,
`win`,
`id`,
FIRST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `rn` DESC) AS `last_id`
FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `rn`
FROM `dbplyr_008`
)
ORDER BY `grp`, `win`, `rn`
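Untested thought: since the translation above already honours first()'s order_by argument (FIRST_VALUE ... ORDER BY rn DESC), it might be possible to skip the helper rn column and order by win directly. I haven't verified this on a real backend, so treat it as a sketch only:
remote_data %>%
  group_by(grp) %>%
  mutate(last_id = first(id, order_by = desc(win))) %>% # hopefully FIRST_VALUE(id) OVER (PARTITION BY grp ORDER BY win DESC)
  arrange(grp, win) %>%
  ungroup()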

Here is a workaround using a join, but it's not very satisfying, and possibly inefficient too:
lkp <- remote_data %>%
  group_by(grp) %>%
  filter(win == max(win, na.rm = TRUE)) %>%
  ungroup() %>%
  select(grp, last_id = id) %>%
  distinct()

remote_data %>%
  left_join(lkp, by = "grp") %>%
  arrange(grp, win) %>%
  print() %>%
  show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 3
#> 5 2 B 1 3
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT *
#> FROM (
#> SELECT `LHS`.`grp` AS `grp`, `win`, `id`, `last_id`
#> FROM `dbplyr_001` AS `LHS`
#> LEFT JOIN (
#> SELECT DISTINCT `grp`, `id` AS `last_id`
#> FROM (
#> SELECT `grp`, `win`, `id`
#> FROM (
#> SELECT *, MAX(`win`) OVER (PARTITION BY `grp`) AS `q01`
#> FROM `dbplyr_001`
#> )
#> WHERE (`win` = `q01`)
#> )
#> ) AS `RHS`
#> ON (`LHS`.`grp` = `RHS`.`grp`)
#> )
#> ORDER BY `grp`, `win`
Created on 2022-12-04 with reprex v2.0.2
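A quick sanity check of the workaround is to collect() both the join result and the local reference from the top of the question and compare them (a sketch reusing the objects defined above):
via_join <- remote_data %>%
  left_join(lkp, by = "grp") %>%
  arrange(grp, win) %>%
  collect()

via_collect <- remote_data %>%
  collect() %>%
  arrange(grp, win) %>%
  group_by(grp) %>%
  mutate(last_id = last(id)) %>%
  ungroup()

all.equal(via_join, via_collect) # expected to be TRUE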

Related

Mutate column conditionally, if any NAs take the highest value and grab the column names

Consider this sample data
# A tibble: 10 x 3
x y z
<int> <dbl> <dbl>
1 1 1 5
2 2 3 6
3 3 4 7
4 4 7 8
5 5 NA 9
6 6 12 10
7 7 NA NA
8 8 2 20
9 9 5.5 25
10 10 NA 8
I would like to mutate a new column, value, that is the row sum when there are no NAs present in any of the columns.
If there are NAs, take the highest value in the row times 1.2.
BUT, if only one column has a value, take that value.
Finally, I also want another column, NA_column, with the names of the columns containing NA in that row!
This is what I have in mind, but I could not figure out the rest:
df %>%
mutate(
value = case_when(any_vars(is.na()) ~ BIGGEST VALUE * 1.2,
TRUE ~ rowsum()),
NA_column = columns that has NA in that row
)
DATA
df <- tibble(
  x = 1:10,
  y = c(1, 3, 4, 7, NA, 12, NA, 2, 5.5, NA),
  z = c(5:10, NA, 20, 25, 8)
)
Use rowwise() and c_across() to perform the operations row by row:
library(dplyr)
df_out <- df %>%
  rowwise() %>%
  mutate(
    value = ifelse(anyNA(c_across(everything())), max(c_across(everything()), na.rm = TRUE) * 1.2, x), # or 'y' or 'z' instead of x
    NA_column = paste(colnames(df)[is.na(c_across(x:z))], collapse = " | ")
  ) %>%
  ungroup()
df_out
# A tibble: 10 × 5
x y z value NA_column
<int> <dbl> <dbl> <dbl> <chr>
1 1 1 5 1 ""
2 2 3 6 2 ""
3 3 4 7 3 ""
4 4 7 8 4 ""
5 5 NA 9 10.8 "y"
6 6 12 10 6 ""
7 7 NA NA 8.4 "y | z"
8 8 2 20 8 ""
9 9 5.5 25 9 ""
10 10 NA 8 12 "y"
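A side note on the design: rowwise() is convenient but can be slow on large data. For the value column alone, a vectorised sketch along the same lines (assuming the same df; the NA_column part is easier to keep in the rowwise form) could be:
df %>%
  mutate(
    n_na  = rowSums(is.na(across(x:z))),                             # how many of x, y, z are missing per row
    value = ifelse(n_na > 0, pmax(x, y, z, na.rm = TRUE) * 1.2, x)   # row-wise max via pmax(), otherwise x as above
  ) %>%
  select(-n_na)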
Often when solving a problem such as this, I find it best to lay out the solution in discrete steps. In this case, using tidyverse syntax, it is possible to create temporary columns containing the bits of data needed to ultimately compute the desired value.
I couldn't immediately improve upon the solution provided for the second part (NA_column) by @Julien above, so I've added that code chunk below.
df <- tibble(
  x = 1:10,
  y = c(1, 3, 4, 7, NA, 12, NA, 2, 5.5, NA),
  z = c(5:10, NA, 20, 25, 8)
)

out <-
  df %>%
  mutate(
    # get number of columns of data
    num_cols = ncol(.),
    # get number of values in row that are not NA
    num_not_na = rowSums(!is.na(.))
  ) %>%
  rowwise() %>%
  mutate(
    # get maximum value of data in row; note we need to be explicit about which columns are data, e.g. (1:3)
    max_value = max(across(1:3), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    # get the desired value for the row:
    # if there are no NA values, or only one non-NA value, we can just take the row sum
    # (again we need to be explicit about the columns, e.g. [, 1:3]),
    # otherwise take the maximum value multiplied by 1.2
    value = ifelse(num_cols == num_not_na | num_not_na == 1, rowSums(.[, 1:3], na.rm = TRUE), max_value * 1.2)
  )
# with credit to @Julien for the following code to get the names of the NA columns
library(magrittr) # needed for the %$% exposition pipe used below
out$NA_column <- sapply(1:nrow(out), function(i) {
  out[i, ] %$%
    colnames(.)[is.na(.)] %>%
    paste(collapse = " | ")
})

# can optionally remove the temporary columns
out <-
  out %>%
  dplyr::select(
    -c(num_cols, num_not_na, max_value)
  )
# > out
# # A tibble: 10 x 5
# x y z value NA_column
# <int> <dbl> <dbl> <dbl> <chr>
# 1 1 1 5 7 ""
# 2 2 3 6 11 ""
# 3 3 4 7 14 ""
# 4 4 7 8 19 ""
# 5 5 NA 9 10.8 "y"
# 6 6 12 10 28 ""
# 7 7 NA NA 7 "y | z"
# 8 8 2 20 30 ""
# 9 9 5.5 25 39.5 ""
#10 10 NA 8 12 "y"

Simple way to get lagged value of a specific row

I want to create a column called lag_diff3, which holds (for every row) the lagged difference at A = 2003. I can do this using the following code, but it seems ugly. Is there a simpler way to write it?
A = c("2001", "2002", "2003", "2004")
B = c(10, 20, 60, 70)
dat = tibble(A = A, B = B) %>%
  mutate(lag_diff1 = B - lag(B, 1),
         lag_diff2 = ifelse(A != "2003", -100, lag_diff1),
         lag_diff3 = max(lag_diff2))
> dat
# A tibble: 4 × 5
A B lag_diff1 lag_diff2 lag_diff3
<chr> <dbl> <dbl> <dbl> <dbl>
1 2001 10 NA -100 40
2 2002 20 10 -100 40
3 2003 60 40 40 40
4 2004 70 10 -100 40
You could do lag_diff1[A == "2003"]:
library(dplyr)
A = c("2001", "2002", "2003", "2004")
B = c(10, 20, 60, 70)
tibble(A = A, B = B) %>%
  mutate(lag_diff1 = B - lag(B, 1),
         lag_diff3 = lag_diff1[A == "2003"])
#> # A tibble: 4 × 4
#> A B lag_diff1 lag_diff3
#> <chr> <dbl> <dbl> <dbl>
#> 1 2001 10 NA 40
#> 2 2002 20 10 40
#> 3 2003 60 40 40
#> 4 2004 70 10 40
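One caveat: lag_diff1[A == "2003"] relies on "2003" matching exactly one row; if it matched none, the subset would have length 0 and the mutate() would error. A slightly more defensive variant (a sketch) uses match(), which always returns a single index, or NA when the year is absent:
tibble(A = A, B = B) %>%
  mutate(lag_diff1 = B - lag(B, 1),
         lag_diff3 = lag_diff1[match("2003", A)]) # NA if "2003" is absent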

Pull out all transactions after a bad credit found

Below is a small dataset of transaction records, with ID, DATE of the month, and a dummy variable Bad_CREDIT. I would like to pull out all the transactions from the point a bad credit record first appears.
The OUTPUT column indicates the correct result, which flags rows 1, 2, 3, 5, 6, 8, and 10.
This is just an example; there could be thousands of rows. SQL, R, or SPSS will all work. Thank you.
DATE  ID  Bad_CREDIT  OUTPUT
12    A   1           1
15    A   1           1
18    A   0           1
2     B   0           0
10    B   1           1
20    B   0           1
5     C   0           0
15    C   1           1
1     D   0           0
9     E   1           1
You can arrange the data by ID and DATE and, within each ID, assign 0 to the first row when its Bad_CREDIT is 0.
library(dplyr)
df %>%
  arrange(ID, DATE) %>%
  group_by(ID) %>%
  mutate(OUTPUT = as.integer(!(first(Bad_CREDIT) == 0 & row_number() == 1)))
# DATE ID Bad_CREDIT OUTPUT
# <int> <chr> <int> <int>
# 1 12 A 1 1
# 2 15 A 1 1
# 3 18 A 0 1
# 4 2 B 0 0
# 5 10 B 1 1
# 6 20 B 0 1
# 7 5 C 0 0
# 8 15 C 1 1
# 9 1 D 0 0
#10 9 E 1 1
data
df <- structure(list(DATE = c(12L, 15L, 18L, 2L, 10L, 20L, 5L, 15L,
1L, 9L), ID = c("A", "A", "A", "B", "B", "B", "C", "C", "D",
"E"), Bad_CREDIT = c(1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L)),
row.names = c(NA, -10L), class = "data.frame")
If I understand correctly, you can use window functions:
select t.*
from (select t.*,
min(case when bad_credit = 1 then date end) over (partition by id) as min_bd_date
from t
) t
where date >= min_bd_date;
You can also do this with a correlated subquery:
select t.*
from t
where t.date >= (select min(t2.date)
from t t2
where t2.id = t.id and
t2.bad_credit = 1
);
If this is in a database, then I think SQL is likely the better place to address this. However, if you already have it in R, then ...
Here's an R method, using dplyr:
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(OUTPUT2 = +cumany(Bad_CREDIT)) %>%
  ungroup()
# # A tibble: 10 x 5
# DATE ID Bad_CREDIT OUTPUT OUTPUT2
# <int> <chr> <int> <int> <int>
# 1 12 A 1 1 1
# 2 15 A 1 1 1
# 3 18 A 0 1 1
# 4 2 B 0 0 0
# 5 10 B 1 1 1
# 6 20 B 0 1 1
# 7 5 C 0 0 0
# 8 15 C 1 1 1
# 9 1 D 0 0 0
# 10 9 E 1 1 1
Because this is effectively a simple grouping operation, the base R and data.table solutions are just as straightforward.
+ave(dat$Bad_CREDIT, dat$ID, FUN=cumany)
# [1] 1 1 1 0 1 1 0 1 0 1
library(data.table)
datDT <- as.data.table(dat)
datDT[, OUTPUT2 := +cumany(Bad_CREDIT), by = .(ID)]
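If you want the base R result as a column rather than a printed vector, it can be assigned straight back onto the data frame:
dat$OUTPUT2 <- +ave(dat$Bad_CREDIT, dat$ID, FUN = cumany)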
You can use EXISTS as follows:
select t.* from your_table t
where exists
(select 1
from your_table tt
where t.id = tt.id
and t.date >= tt.date
and tt.bad_credit = 1);
This is for SPSS:
sort cases by ID date.
compute PullOut=Bad_CREDIT.
if $casenum>1 and ID=lag(ID) and lag(PullOut)=1 PullOut=1.
exe.

How to create a bucket of timestamps with a gap of X minutes in R from given timestamp values (example given)?

I have a mydf table with a time_stamps column and devices. I want to keep merging time_stamps as long as the difference between two consecutive time_stamps is less than or equal to 30 minutes. The starting time_stamp would be marked as start_timestamp, and when the gap is more than 30 minutes I would end that visit and classify that end as end_timestamps, as in the example given below.
df<-data.frame(customer=rep("XYZ",4),device=rep("x",4),time_stamps=c("2020-05-13 07:50:06","2020-05-13 07:55:06","2020-05-13 08:05:06","2020-05-13 08:50:06"))
df1<-data.frame(customer=rep("XYZ",3),device=rep("y",3),time_stamps=c("2020-05-14 07:50:06","2020-05-14 08:15:06","2020-05-14 08:25:06"))
df2<-data.frame(customer=rep("XYZ",1),device=rep("z",1),time_stamps=c("2020-05-16 09:50:06"))
df3<-data.frame(customer=rep("XYZ",2),device=rep("a",2),time_stamps=c("2020-05-16 09:50:06","2020-05-16 19:50:06"))
df4<-data.frame(customer=rep("XYZ",2),device=rep("b",2),time_stamps=c("2020-05-17 09:50:06","2020-05-17 10:15:06"))
df5<-data.frame(customer=rep("XYZ",4),device=rep("c",4),time_stamps=c("2020-05-13 07:50:06","2020-05-13 07:55:06","2020-05-13 08:05:06","2020-05-13 08:32:06"))
mydf<-rbind(df,df1,df2,df3,df4,df5)
This is my expected data frame
expected_df<-data.frame(customer=rep("XYZ",8),device=c("x","x","y","z","a","a","b","c"),
start_timestamp=c("2020-05-13 07:50:06","2020-05-13 08:50:06","2020-05-14 07:50:06","2020-05-16 09:50:06","2020-05-16 09:50:06","2020-05-16 19:50:06","2020-05-17 09:50:06","2020-05-13 07:50:06"),
end_startstamp=c("2020-05-13 08:05:06","2020-05-13 08:50:06","2020-05-14 08:25:06","2020-05-16 09:50:06","2020-05-16 09:50:06","2020-05-16 19:50:06","2020-05-17 10:15:06","2020-05-13 08:32:06"))
The point is to create groups we can group_by(). To do so, we identify the records that are within 30 * 60 seconds of each other, and then use rle() to consolidate them:
library(dplyr)
mydf %>%
  group_by(customer, device) %>%
  mutate(time_stamps = as.POSIXct(time_stamps),
         diff = time_stamps - lag(time_stamps, default = first(time_stamps)),
         same_group_as_lag = diff <= 30 * 60,
         group = with(rle(same_group_as_lag), rep(seq_along(lengths), lengths)))
#> # A tibble: 16 x 6
#> # Groups: customer, device [6]
#> customer device time_stamps diff same_group_as_lag group
#> <fct> <fct> <dttm> <drtn> <lgl> <int>
#> 1 XYZ x 2020-05-13 07:50:06 0 secs TRUE 1
#> 2 XYZ x 2020-05-13 07:55:06 300 secs TRUE 1
#> 3 XYZ x 2020-05-13 08:05:06 600 secs TRUE 1
#> 4 XYZ x 2020-05-13 08:50:06 2700 secs FALSE 2
#> 5 XYZ y 2020-05-14 07:50:06 0 secs TRUE 1
#> 6 XYZ y 2020-05-14 08:15:06 1500 secs TRUE 1
#> 7 XYZ y 2020-05-14 08:25:06 600 secs TRUE 1
#> 8 XYZ z 2020-05-16 09:50:06 0 secs TRUE 1
#> 9 XYZ a 2020-05-16 09:50:06 0 secs TRUE 1
#> 10 XYZ a 2020-05-16 19:50:06 36000 secs FALSE 2
#> 11 XYZ b 2020-05-17 09:50:06 0 secs TRUE 1
#> 12 XYZ b 2020-05-17 10:15:06 1500 secs TRUE 1
#> 13 XYZ c 2020-05-13 07:50:06 0 secs TRUE 1
#> 14 XYZ c 2020-05-13 07:55:06 300 secs TRUE 1
#> 15 XYZ c 2020-05-13 08:05:06 600 secs TRUE 1
#> 16 XYZ c 2020-05-13 08:32:06 1620 secs TRUE 1
Then it is just a matter of summarising:
mydf %>%
  group_by(customer, device) %>%
  mutate(time_stamps = as.POSIXct(time_stamps),
         diff = time_stamps - lag(time_stamps, default = first(time_stamps)),
         same_group_as_lag = diff <= 30 * 60,
         group = with(rle(same_group_as_lag), rep(seq_along(lengths), lengths))) %>%
  group_by(group, add = TRUE) %>%
  summarise(start_timestamp = min(time_stamps),
            end_startstamp = max(time_stamps))
#> # A tibble: 8 x 5
#> # Groups: customer, device [6]
#> customer device group start_timestamp end_startstamp
#> <fct> <fct> <int> <dttm> <dttm>
#> 1 XYZ x 1 2020-05-13 07:50:06 2020-05-13 08:05:06
#> 2 XYZ x 2 2020-05-13 08:50:06 2020-05-13 08:50:06
#> 3 XYZ y 1 2020-05-14 07:50:06 2020-05-14 08:25:06
#> 4 XYZ z 1 2020-05-16 09:50:06 2020-05-16 09:50:06
#> 5 XYZ a 1 2020-05-16 09:50:06 2020-05-16 09:50:06
#> 6 XYZ a 2 2020-05-16 19:50:06 2020-05-16 19:50:06
#> 7 XYZ b 1 2020-05-17 09:50:06 2020-05-17 10:15:06
#> 8 XYZ c 1 2020-05-13 07:50:06 2020-05-13 08:32:06
Created on 2020-06-25 by the reprex package (v0.3.0)
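A small note on the grouping step: the rle() trick starts a new group at every change of same_group_as_lag, which coincides with the intended sessions for this data. An alternative I find slightly easier to reason about, and which keeps rows that follow a long gap attached to their new session even if further short gaps occur, is a cumulative count of the "gap too long" flags. A sketch with the same mydf:
mydf %>%
  group_by(customer, device) %>%
  mutate(time_stamps = as.POSIXct(time_stamps),
         diff = time_stamps - lag(time_stamps, default = first(time_stamps)),
         group = cumsum(diff > 30 * 60) + 1) %>% # a new session whenever the gap exceeds 30 minutes
  group_by(group, .add = TRUE) %>%
  summarise(start_timestamp = min(time_stamps),
            end_startstamp = max(time_stamps))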

How to do a three-way weighted table in R - similar to wtd.table

I have found MANY questions similar to mine, but they either don't want weighted tables or only want two-way tables. I am trying to do both.
Using wtd.table, I have the following line of code:
wtd.table(fulldata2$income, fulldata2$WIHH, fulldata2$hhsize, weights = fulldata2$WGTP)
This output only gives income by WIHH, weighted; it does not also break the result down by hhsize.
Using regular table, I get the correct output in a three-way format, but not weighted.
tab <- table(fulldata2$income, fulldata2$WIHH, fulldata2$hhsize)
tab2 <- prop.table(tab)
What function can produce frequency tables that are both three-way and weighted? Ideally it would also give proportions, like prop.table() does.
Thanks!
First, here are some sample data (try to include these in your questions, even if it requires creating a sample data set like this). Note that I am using the tidyverse packages here:
library(tidyverse)

test <-
  tibble(
    var1 = "A"
    , var2 = "b"
    , var3 = "alpha") %>%
  complete(
    var1 = c("A", "B")
    , var2 = c("a", "b")
    , var3 = c("alpha", "beta")) %>%
  mutate(wt = 1:n())
So, the data are:
# A tibble: 8 x 4
var1 var2 var3 wt
<chr> <chr> <chr> <int>
1 A a alpha 1
2 A a beta 2
3 A b alpha 3
4 A b beta 4
5 B a alpha 5
6 B a beta 6
7 B b alpha 7
8 B b beta 8
The function you are looking for then is xtabs:
xtabs(wt ~ var1 + var2 + var3
, data = test)
gives:
, , var3 = alpha
var2
var1 a b
A 1 3
B 5 7
, , var3 = beta
var2
var1 a b
A 2 4
B 6 8
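Since xtabs() returns an object of class table, the proportion part of the question can be handled exactly as with table(), e.g.:
tab <- xtabs(wt ~ var1 + var2 + var3, data = test)
prop.table(tab)              # proportions over the whole table
prop.table(tab, margin = 3)  # proportions within each level of var3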
If you don't need the result to have the table class, you can also do this by just using count from dplyr (part of the tidyverse):
test %>%
  count(var1, var2, var3
        , wt = wt)
gives a tibble (a modified data.frame) with your results:
# A tibble: 8 x 4
var1 var2 var3 n
<chr> <chr> <chr> <int>
1 A a alpha 1
2 A a beta 2
3 A b alpha 3
4 A b beta 4
5 B a alpha 5
6 B a beta 6
7 B b alpha 7
8 B b beta 8
And you can then perform whatever calculations you want on it, e.g. the percent within each var3:
test %>%
  count(var1, var2, var3
        , wt = wt) %>%
  group_by(var3) %>%
  mutate(prop_in_var3 = n / sum(n))
gives:
# A tibble: 8 x 5
# Groups: var3 [2]
var1 var2 var3 n prop_in_var3
<chr> <chr> <chr> <int> <dbl>
1 A a alpha 1 0.0625
2 A a beta 2 0.1
3 A b alpha 3 0.188
4 A b beta 4 0.2
5 B a alpha 5 0.312
6 B a beta 6 0.3
7 B b alpha 7 0.438
8 B b beta 8 0.4