Randomly delete and add rows based on ID - data-manipulation

Does anyone know how to randomly delete and add rows based on ID? Here is a reproducible example:
> y <- rnorm(20)
> x <- rnorm(20)
> z <- rep(1:5, 4)
> w <- rep(1:4, each=5)
> data.frame(id=z,cluster=w,x=x,y=y) #this is a balanced dataset
id cluster x y
1 1 1 0.30003855 0.65325768
2 2 1 -1.00563626 -0.12270866
3 3 1 0.01925927 -0.41367651
4 4 1 -1.07742065 -2.64314895
5 5 1 0.71270333 -0.09294102
6 1 2 1.08477509 0.43028470
7 2 2 -2.22498770 0.53539884
8 3 2 1.23569346 -0.55527835
9 4 2 -1.24104450 1.77950291
10 5 2 0.45476927 0.28642442
11 1 3 0.65990264 0.12631586
12 2 3 -0.19988983 1.27226678
13 3 3 -0.64511396 -0.71846622
14 4 3 0.16532102 -0.45033862
15 5 3 0.43881870 2.39745248
16 1 4 0.88330282 0.01112919
17 2 4 -2.05233698 1.63356842
18 3 4 -1.63637927 -1.43850664
19 4 4 1.43040234 -0.19051680
20 5 4 1.04662885 0.37842390
After randomly adding and deleting some data based on ID, the dataset looks like this and the total number of observations should match the one above:
id cluster x y
1 1 1 0.895 -0.659
2 2 1 -0.160 -0.366
3 1 2 -0.528 -0.294
4 2 2 -0.919 0.362
5 3 2 -0.901 -0.467
6 1 3 0.275 0.134
7 2 3 0.423 0.534
8 3 3 0.929 -0.953
9 4 3 1.67 0.668
10 5 3 0.286 0.0872
11 1 4 -0.373 -0.109
12 2 4 0.289 0.299
13 3 4 -1.43 -0.677
14 4 4 -0.884 1.70
15 5 4 1.12 0.386
16 1 5 -0.723 0.247
17 2 5 0.463 -2.59
18 3 5 0.234 0.893
19 4 5 -0.313 -1.96
20 5 5 0.848 -0.0613

We could pick the rows to delete by drawing a sample of random size (between 1 and nrow(d)) from a new combined id-cluster key, id2. We then add as many new rows as were sampled, placing them in a new cluster whose number is one higher than the current maximum.
Judging from your expected output, you may want at least 2 observations per cluster in the result. We can handle that through the function arguments and reject nonsensical combinations with a few stopifnot() checks. A repeat loop then breaks as soon as the conditions are met.
# Randomly swap some rows of `d` for freshly simulated rows in a new cluster.
#
# A random number of existing rows (between 1 and nrow(d)) is removed and the
# same number of new rows is appended, so the total number of observations
# stays the same. The new rows get ids 1..k, standard-normal x and y, and the
# cluster number max(d$cluster) + 1. Draws are repeated until the result
# satisfies the constraints below.
#
# Arguments:
#   d       data frame with the columns id, cluster, x and y; each id/cluster
#           pair is expected to identify exactly one row.
#   cl.obs  minimum number of observations every cluster of the result must
#           have.
#   min.cl  minimum number of clusters in the result; NA (the default) means
#           the number of clusters found in `d`.
#
# Returns a data frame with the columns of `d`, the same number of rows and
# row names reset to 1..n. An empty `d` is returned unchanged. Stops with an
# error if the arguments can never be satisfied for `d`.
FUN <- function(d, cl.obs=2, min.cl=NA) {
  # Nothing can be swapped in an empty data frame.
  if (nrow(d) == 0L) return(d)
  n.cl <- length(unique(d$cluster))
  if (is.na(min.cl)) min.cl <- n.cl
  # Reject argument combinations that no draw could ever fulfil.
  stopifnot(cl.obs <= min(table(d$cluster)))
  stopifnot(min.cl <= n.cl + 1)
  stopifnot(cl.obs*min.cl <= nrow(d))
  # Row key built from id and cluster. It is kept as a local vector so no
  # helper column has to be added to `d` and dropped again by position.
  key <- paste(d$id, d$cluster)
  new.cl <- max(d$cluster) + 1
  repeat {
    # First draw how many rows to swap, then which ones (same RNG order as
    # a nested sample(key, sample(...)) call).
    n.swap <- sample(seq_len(nrow(d)), 1)
    samp <- sample(key, n.swap)
    a <- cbind(id=seq_len(n.swap), cluster=new.cl,
               matrix(rnorm(n.swap*2), ncol=2,
                      dimnames=list(NULL, c("x", "y"))))
    o <- rbind(d[!key %in% samp, , drop=FALSE], a)
    cl.tb <- table(o$cluster)
    # Both conditions are scalars, so the short-circuit && is the right form.
    if (all(cl.tb >= cl.obs) && length(cl.tb) >= min.cl) break
  }
  `rownames<-`(o, NULL)
}
set.seed(42)
FUN(d)
# id cluster x y
# 1 1 1 -0.30663859 1.37095845
# 2 2 1 -1.78130843 -0.56469817
# 3 3 1 -0.17191736 0.36312841
# 4 4 1 1.21467470 0.63286260
# 5 1 2 -0.43046913 -0.10612452
# 6 2 2 -0.25726938 1.51152200
# 7 3 2 -1.76316309 -0.09465904
# 8 4 2 0.46009735 2.01842371
# 9 5 2 -0.63999488 -0.06271410
# 10 1 3 0.45545012 1.30486965
# 11 2 3 0.70483734 2.28664539
# 12 3 3 1.03510352 -1.38886070
# 13 5 3 0.50495512 -0.13332134
# 14 2 4 -0.78445901 -0.28425292
# 15 3 4 -0.85090759 -2.65645542
# 16 4 4 -2.41420765 -2.44046693
# 17 5 4 0.03612261 1.32011335
# 18 1 5 -0.43144620 -0.78383894
# 19 2 5 0.65564788 1.57572752
# 20 3 5 0.32192527 0.64289931
Using arguments:
set.seed(666)
FUN(d, cl.obs=1)
# id cluster x y
# 1 4 1 1.21467470 0.63286260 ## just one obs in cl. 1
# 2 2 2 -0.25726938 1.51152200
# 3 3 2 -1.76316309 -0.09465904
# 4 5 2 -0.63999488 -0.06271410
# 5 1 3 0.45545012 1.30486965
# 6 3 3 1.03510352 -1.38886070
# 7 5 3 0.50495512 -0.13332134
# 8 1 4 -1.71700868 0.63595040
# 9 3 4 -0.85090759 -2.65645542
# 10 1 5 -0.08365711 0.07771005
# 11 2 5 0.25683143 2.12925556
# 12 3 5 -1.07362365 0.63895459
# 13 4 5 -0.62286788 0.26934743
# 14 5 5 0.28499111 2.29896933
# 15 6 5 1.05156653 -1.37464590
# 16 7 5 -0.25952120 0.66236713
# 17 8 5 0.02230428 0.48351632
# 18 9 5 -0.01440929 1.23229183
# 19 10 5 1.33285534 -1.77762517
# 20 11 5 0.14842679 0.88552740
Data:
# Example data used above: 20 rows with integer id 1-5 repeated within each
# of the integer clusters 1-4, plus two numeric columns x and y.
d <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), cluster = c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L), x = c(-0.306638594078475, -1.78130843398, -0.171917355759621,
1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893,
-1.76316308519478, 0.460097354831271, -0.639994875960119, 0.455450123241219,
0.704837337228819, 1.03510352196992, -0.608926375407211, 0.50495512329797,
-1.71700867907334, -0.784459008379496, -0.850907594176518, -2.41420764994663,
0.0361226068922556), y = c(1.37095844714667, -0.564698171396089,
0.363128411337339, 0.63286260496104, 0.404268323140999, -0.106124516091484,
1.51152199743894, -0.0946590384130976, 2.01842371387704, -0.062714099052421,
1.30486965422349, 2.28664539270111, -1.38886070111234, -0.278788766817371,
-0.133321336393658, 0.635950398070074, -0.284252921416072, -2.65645542090478,
-2.44046692857552, 1.32011334573019)), class = "data.frame", row.names = c(NA,
-20L))

Related

Create new column which orders two previous columns

I'm looking to create a new column which is based on the ordering of two other columns, preferably using the Tidyverse functions, but any suggestions are appreciated. I have a table of around 1300 entries and several columns but a sample of my data looks something like:
Number of people
TotalOrder
TotalQuantile
12
1
1
19
2
1
21
3
2
45
5
2
53
5
3
55
6
3
60
7
4
75
8
4
But I want a fourth column which ranks TotalOrder within TotalQuantile, and to look something like:
Number of people
TotalOrder
TotalQuantile
NewOrder
12
1
1
1
19
2
1
2
21
3
2
1
45
5
2
2
53
5
3
1
55
6
3
2
60
7
4
1
75
8
4
2
I've tried a few things like filtering, arranging, etc but it's not worked out. Thanks for the help.
library(dplyr)
# Example data: eight rows, two per TotalQuantile, stored as a tibble.
df <-
  structure(list(
    Number.of.people = c(12L, 19L, 21L, 45L, 53L, 55L, 60L, 75L),
    TotalOrder = c(1L, 2L, 3L, 5L, 5L, 6L, 7L, 8L),
    TotalQuantile = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L)),
    row.names = c(NA,-8L), class = c("tbl_df", "tbl", "data.frame"))

# Rank TotalOrder within each TotalQuantile group. Passing the column to
# row_number() ranks by its values rather than by the current row position,
# so the result no longer depends on the rows being pre-sorted. The result
# is left grouped by TotalQuantile.
df %>%
  group_by(TotalQuantile) %>%
  mutate(NewOrder = row_number(TotalOrder))
# A tibble: 8 x 4
# Groups: TotalQuantile [4]
Number.of.people TotalOrder TotalQuantile NewOrder
<int> <int> <int> <int>
1 12 1 1 1
2 19 2 1 2
3 21 3 2 1
4 45 5 2 2
5 53 5 3 1
6 55 6 3 2
7 60 7 4 1
8 75 8 4 2

How to shuffle according column (id) but keep descending True

I have a dataframe which is structured as follows:
>>>df
a b id
0 1 4 3
1 4 1 2
2 7 5 1
3 2 9 3
4 4 11 2
5 2 7 1
6 3 4 2
7 9 2 1
I have added paragraphs in code for readability.
Now I want to shuffle the rows by id while keeping the initial descending order of the id column within each block. What is the best way?
A possible output would look like following:
>>>df
a b id
0 3 4 2
1 9 2 1
2 2 9 3
3 4 11 2
4 2 7 1
5 1 4 3
6 4 1 2
7 7 5 1
So in principle I just want the blocks to mix or to be randomly placed in another place.
Create groups from the difference in id - a new group starts wherever the difference is not -1 - then get the unique group ids, shuffle them, and change the ordering with DataFrame.loc:
# Start a new group wherever the step from the previous id is not -1,
# then number the groups with a running sum of those starts
df['g'] = df['id'].diff().ne(-1).cumsum()
# Alternative if the step between consecutive ids is not always -1:
# start a new group wherever id is greater than or equal to the previous id
df['g'] = df['id'].ge(df['id'].shift()).cumsum()
print (df)
a b id g
0 1 4 3 1
1 4 1 2 1
2 7 5 1 1
3 2 9 3 2
4 4 11 2 2
5 2 7 1 2
6 3 4 2 3
7 9 2 1 3
# Distinct group labels, shuffled in place
ids = df['g'].unique()
np.random.shuffle(ids)
# Select whole groups in the shuffled label order, then discard the helper index g
df = df.set_index('g').loc[ids].reset_index(drop=True)
print (df)
a b id
0 1 4 3
1 4 1 2
2 7 5 1
3 3 4 2
4 9 2 1
5 2 9 3
6 4 11 2
7 2 7 1
If you need to check the groups via the helper column, change the final reset_index(drop=True) to reset_index():
# Distinct group labels, shuffled in place
ids = df['g'].unique()
np.random.shuffle(ids)
# Select whole groups in the shuffled label order; reset_index() without drop
# keeps g as a regular column so the grouping stays visible
df = df.set_index('g').loc[ids].reset_index()
print (df)
g a b id
0 2 3 4 2
1 2 9 2 1
2 1 2 9 3
3 1 4 11 2
4 1 2 7 1
5 0 1 4 3
6 0 4 1 2
7 0 7 5 1
Performance: on the sample data, I guess the repeated sorting is the reason for the slower performance of the other solution.
#4k rows
df = pd.concat([df] * 500, ignore_index=True)
print (df)
In [70]: %%timeit
...: out = df.assign(order=df['id'].ge(df['id'].shift()).cumsum()).sample(frac=1)
...: cat = pd.CategoricalDtype(out['order'].unique(), ordered=True)
...: out = out = out.astype({'order': cat}).sort_values(['order', 'id'], ascending=False)
...:
6.13 ms ± 845 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
df['g'] = df['id'].diff().ne(-1).cumsum()
ids = df['g'].unique()
np.random.shuffle(ids)
df.set_index('g').loc[ids].reset_index(drop=True)
3.93 ms ± 161 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Use a categorical index to sort values by block:
# Label each block (a new block starts wherever id does not decrease) and
# shuffle all rows
out = df.assign(order=df['id'].ge(df['id'].shift()).cumsum()).sample(frac=1)
# Ordered categorical whose category order is the order in which the block
# labels first appear after shuffling
cat = pd.CategoricalDtype(out['order'].unique(), ordered=True)
# Sort by block (in categorical order) and by id, both descending, which puts
# each block back together with its ids in descending order
out = out.astype({'order': cat}).sort_values(['order', 'id'], ascending=False)
print(out)
# Output:
a b id order
0 1 4 3 0
1 4 1 2 0
2 7 5 1 0
6 3 4 2 2
7 9 2 1 2
3 2 9 3 1
4 4 11 2 1
5 2 7 1 1
Obviously, you can remove the order column by appending .drop(columns='order') after sort_values but I keep it here for demonstration purpose.
The key here is to set ordered=True to your new categorical dtype.
>>> cat
CategoricalDtype(categories=[1, 2, 0], ordered=True)

Insert a level o in the existing data frame such that 4 columns are grouped as one

I want to do multiindexing for my data frame such that MAE,MSE,RMSE,MPE are grouped together and given a new index level. Similarly the rest of the four should be grouped together in the same level but different name
> mux3 = pd.MultiIndex.from_product([list('ABCD'),list('1234')],
> names=['one','two'])###dummy data
> df3 = pd.DataFrame(np.random.choice(10, (3, len(mux))), columns=mux3) #### dummy data frame
> print(df3) #intended output required for the data frame in the picture given below
Assuming column groups are already in the appropriate order we can simply create an np.arange over the length of the columns and floor divide by 4 to get groups and create a simple MultiIndex.from_arrays.
Sample Input and Output:
import numpy as np
import pandas as pd
# Column labels 1-4 repeated three times (12 columns)
initial_index = [1, 2, 3, 4] * 3
# Fix the seed so the random values are reproducible
np.random.seed(5)
# Three rows of random integers from 0-9, one column per label
df3 = pd.DataFrame(
np.random.choice(10, (3, len(initial_index))), columns=initial_index
)
1 2 3 4 1 2 3 4 1 2 3 4 # Column headers are in repeating order
0 3 6 6 0 9 8 4 7 0 0 7 1
1 5 7 0 1 4 6 2 9 9 9 9 1
2 2 7 0 5 0 0 4 4 9 3 2 4
# Create New Columns
# Level 0 is the column position floor-divided by 4, so this relies on the
# columns already being laid out in consecutive groups of four
df3.columns = pd.MultiIndex.from_arrays([
np.arange(len(df3.columns)) // 4, # Group Each set of 4 columns together
df3.columns # Keep level 1 the same as current columns
], names=['one', 'two']) # Set Names (optional)
df3
one 0 1 2
two 1 2 3 4 1 2 3 4 1 2 3 4
0 3 6 6 0 9 8 4 7 0 0 7 1
1 5 7 0 1 4 6 2 9 9 9 9 1
2 2 7 0 5 0 0 4 4 9 3 2 4
If columns are in mixed order:
# Same seed as before, but only 8 columns whose repeated labels are in mixed order
np.random.seed(5)
df3 = pd.DataFrame(
np.random.choice(10, (3, 8)), columns=[1, 1, 3, 2, 4, 3, 2, 4]
)
df3
1 1 3 2 4 3 2 4 # Cannot select groups positionally
0 3 6 6 0 9 8 4 7
1 0 0 7 1 5 7 0 1
2 4 6 2 9 9 9 9 1
We can convert Index.to_series then enumerate columns using groupby cumcount then sort_index if needed to get in order:
df3.columns = pd.MultiIndex.from_arrays([
# Enumerate Groups to create new level 0 index
# (cumcount numbers each occurrence of a label: first -> 0, second -> 1, ...)
df3.columns.to_series().groupby(df3.columns).cumcount(),
df3.columns
], names=['one', 'two']) # Set Names (optional)
# Sort to Order Correctly
# (Do not sort before setting columns it will break alignment with data)
df3 = df3.sort_index(axis=1)
df3
one 0 1
two 1 2 3 4 1 2 3 4 # Notice Data has moved with headers
0 3 0 6 9 6 4 8 7
1 0 1 7 5 0 0 7 1
2 4 9 2 9 6 9 9 1

Pandas covid dataframe: get cases per day per county

I am reading covid-19 data from https://ti.saude.rs.gov.br/covid19/download , and I would like to:
select only rows where 'MUNICIPIO' column has value of 'SÃO LOURENÇO DO SUL';
then sort by column 'DATA_CONFIRMACAO';
then count rows in each group, getting a timeseries where "each point is the number of cases per day";
then plot with x-axis being date, and y-axis being count;
I tried this, without success:
import matplotlib.pyplot as plt
import pandas as pd
# Column names of the dataset:
# Index(['COD_IBGE', 'MUNICIPIO', 'COD_REGIAO_COVID', 'REGIAO_COVID', 'SEXO',
# 'FAIXAETARIA', 'CRITERIO', 'DATA_CONFIRMACAO', 'DATA_SINTOMAS',
# 'DATA_EVOLUCAO', 'EVOLUCAO', 'HOSPITALIZADO', 'FEBRE', 'TOSSE',
# 'GARGANTA', 'DISPNEIA', 'OUTROS', 'CONDICOES', 'GESTANTE',
# 'DATA_INCLUSAO_OBITO', 'DATA_EVOLUCAO_ESTIMADA', 'RACA_COR',
# 'ETNIA_INDIGENA', 'PROFISSIONAL_SAUDE', 'BAIRRO', 'HOSPITALIZACAO_SRAG',
# 'FONTE_INFORMACAO', 'PAIS_NASCIMENTO', 'PES_PRIV_LIBERDADE'],
# Download page of the dataset; not used below, the CSV is read from a local file
url = "https://ti.saude.rs.gov.br/covid19/download"
# The file is semicolon-delimited
data = pd.read_csv('covid-rs.csv', delimiter=';')
# Keep one municipality, then count the non-null values of every column per confirmation date
result = data[data['MUNICIPIO'] == 'SÃO LOURENÇO DO SUL'].groupby('DATA_CONFIRMACAO').count()
print(result)
Output is:
COD_IBGE MUNICIPIO COD_REGIAO_COVID REGIAO_COVID SEXO FAIXAETARIA CRITERIO ... ETNIA_INDIGENA PROFISSIONAL_SAUDE BAIRRO HOSPITALIZACAO_SRAG FONTE_INFORMACAO PAIS_NASCIMENTO PES_PRIV_LIBERDADE
DATA_CONFIRMACAO ...
01/07/2020 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2
01/09/2020 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2
01/12/2020 24 24 24 24 24 24 24 ... 24 24 24 24 24 24 24
02/07/2020 3 3 3 3 3 3 3 ... 3 3 3 3 3 3 3
02/09/2020 5 5 5 5 5 5 5 ... 5 5 5 5 5 5 5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
30/11/2020 20 20 20 20 20 20 20 ... 20 20 19 20 20 20 20
31/03/2020 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1
31/07/2020 5 5 5 5 5 5 5 ... 5 5 5 5 5 5 5
31/08/2020 7 7 7 7 7 7 7 ... 7 7 7 7 7 7 7
31/10/2020 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1
[129 rows x 28 columns]
Try converting your dates to datetime type, then groupby will sort your date automatically. Plus, you would get better looking x-ticks.
# Download page of the dataset; the CSV itself is read from a local copy
url = "https://ti.saude.rs.gov.br/covid19/download"
# Parse the confirmation date as a day-first datetime so that grouping sorts
# chronologically instead of alphabetically
data = pd.read_csv('covid-rs.csv', delimiter=';',
                   parse_dates=['DATA_CONFIRMACAO'],
                   dayfirst=True)
# Keep only the rows of the municipality of interest
cases = data[data['MUNICIPIO'] == 'SÃO LOURENÇO DO SUL']
# size() counts rows per day and returns a single Series; count() would
# return one non-null count per column and undercount where a column is missing
result = cases.groupby('DATA_CONFIRMACAO').size()
print(result)

Pull out all transactions after a bad credit found

Below is a small dataset of transaction records, with an ID, the DATE (day of the month), and a dummy variable indicating Bad_CREDIT or not. I would like to pull out all the transactions from the point where a bad credit starts.
The OUTPUT column indicates the correct result, which is rows 1,2,3,5,6,8,10.
This is just an example, there could be thousands of rows. SQL, R, SPSS will all work. Thank you.
DATE
ID
Bad_CREDIT
OUTPUT
12
A
1
1
15
A
1
1
18
A
0
1
2
B
0
0
10
B
1
1
20
B
0
1
5
C
0
0
15
C
1
1
1
D
0
0
9
E
1
1
You can arrange the data by ID and DATE and for each ID assign 0 if the first value of Bad_CREDIT is 0.
library(dplyr)
# Flag every transaction from the first bad credit onward within each ID.
# Rows are ordered by DATE inside each ID first; cummax() of the 0/1 flag then
# stays 0 until the first Bad_CREDIT = 1 and is 1 for every later row. This
# also covers IDs with several rows before their first bad credit and IDs
# that never have one.
df %>%
  arrange(ID, DATE) %>%
  group_by(ID) %>%
  mutate(OUTPUT = cummax(Bad_CREDIT)) %>%
  ungroup()
# DATE ID Bad_CREDIT OUTPUT
# <int> <chr> <int> <int>
# 1 12 A 1 1
# 2 15 A 1 1
# 3 18 A 0 1
# 4 2 B 0 0
# 5 10 B 1 1
# 6 20 B 0 1
# 7 5 C 0 0
# 8 15 C 1 1
# 9 1 D 0 0
#10 9 E 1 1
data
# Example data: ten transactions with integer DATE, character ID and a 0/1 Bad_CREDIT flag.
df <- structure(list(DATE = c(12L, 15L, 18L, 2L, 10L, 20L, 5L, 15L,
1L, 9L), ID = c("A", "A", "A", "B", "B", "B", "C", "C", "D",
"E"), Bad_CREDIT = c(1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L)),
row.names = c(NA, -10L), class = "data.frame")
If I understand correctly, you can use window functions:
-- Keep every row dated on or after the first bad-credit date of its id.
select t.*
from (select t.*,
-- earliest date with bad_credit = 1 for this id; NULL when the id has none
min(case when bad_credit = 1 then date end) over (partition by id) as min_bd_date
from t
) t
-- a NULL min_bd_date never satisfies the comparison, so ids without bad credit drop out
where date >= min_bd_date;
You can also do this with a correlated subquery:
-- Same filter with a correlated subquery: compare each row's date with the
-- earliest bad-credit date of the same id.
select t.*
from t
where t.date >= (select min(t2.date)
from t t2
where t2.id = t.id and
t2.bad_credit = 1
);
If this is in a database, then I think SQL is likely the better place to address this. However, if you already have it in R, then ...
Here's an R method, using dplyr:
library(dplyr)
# Flag each row at or after the first Bad_CREDIT = 1 within its ID.
# NOTE(review): cumany() accumulates in row order, so this presumably relies
# on dat already being sorted by DATE within ID - confirm.
dat %>%
group_by(ID) %>%
# unary + turns the logical result of cumany() into 0/1 integers
mutate(OUTPUT2 = +cumany(Bad_CREDIT)) %>%
ungroup()
# # A tibble: 10 x 5
# DATE ID Bad_CREDIT OUTPUT OUTPUT2
# <int> <chr> <int> <int> <int>
# 1 12 A 1 1 1
# 2 15 A 1 1 1
# 3 18 A 0 1 1
# 4 2 B 0 0 0
# 5 10 B 1 1 1
# 6 20 B 0 1 1
# 7 5 C 0 0 0
# 8 15 C 1 1 1
# 9 1 D 0 0 0
# 10 9 E 1 1 1
Because this is effectively a simple grouping operation, base R and data.table solutions are just as straightforward.
# Base R: running maximum of the 0/1 flag within each ID. cummax() is used
# instead of dplyr's cumany() so that this line needs no package at all.
ave(dat$Bad_CREDIT, dat$ID, FUN = cummax)
# [1] 1 1 1 0 1 1 0 1 0 1
library(data.table)
datDT <- as.data.table(dat)
# data.table: the same running maximum per ID, added by reference as OUTPUT2.
datDT[, OUTPUT2 := cummax(Bad_CREDIT), by = .(ID)]
You can use EXISTS as follows:
-- Keep a row when the same id has a bad-credit row dated on or before it.
select t.* from your_table t
where exists
(select 1
from your_table tt
where t.id = tt.id
and t.date >= tt.date
and tt.bad_credit = 1);
This is for SPSS:
* Order the cases by ID and, within each ID, by date.
sort cases by ID date.
* Start from the flag of the case itself.
compute PullOut=Bad_CREDIT.
* Carry a 1 forward from the previous case when it belongs to the same ID.
if $casenum>1 and ID=lag(ID) and lag(PullOut)=1 PullOut=1.
exe.