Remove certain observations based on two criteria - dataframe

all the rows that have observations for the same date and Key are marked.
#Data
Key Date Value ...other columns
C 2000-04 0.55 name1 x1 <-
C 2000-04 0.60 name2 x2 <-
C 2000-05 1.2
A 2001-06 4
A 2001-07 5
A 2002-08 2
Is there an easy way to remove those observations instead of summing them up?
Thanks a lot! :)

Just use duplicated twice, one with the default fromLast = FALSE and the other setting fromLast = TRUE.
Then, negate that result index.
# Flag every row whose (Key, Date) pair occurs more than once. Scanning from
# the top marks the later copies and scanning from the bottom marks the
# earlier ones, so the union of both scans marks all of them.
key_cols <- Data[c("Key", "Date")]
seen_before <- duplicated(key_cols)
seen_after <- duplicated(key_cols, fromLast = TRUE)
inx <- seen_before | seen_after
# Keep only the rows that were never flagged.
Data[!inx, ]
# Key Date Value
#3 C 2000-05 1.2
#4 A 2001-06 4.0
#5 A 2001-07 5.0
#6 A 2002-08 2.0
Data.
Data <-
structure(list(Key = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("A",
"C"), class = "factor"), Date = structure(c(1L, 1L, 2L, 3L, 4L,
5L), .Label = c("2000-04", "2000-05", "2001-06", "2001-07", "2002-08"
), class = "factor"), Value = c(0.55, 0.6, 1.2, 4, 5, 2)), class = "data.frame", row.names = c(NA,
-6L))

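I am not sure I understand the question correctly, but if I do, a very simple answer is the unique() function. Note that unique() keeps one copy of each fully duplicated row rather than dropping every copy.
First, though, you should have a very strong reason to remove these duplicates.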
# Toy data in which several (Key, Value) combinations occur more than once.
Key <- c("C", "C", "C", "A", "A", "A", "C", "C", "C")
Value <- c(1, 2, 1, 2, 1, 2, 1, 2, 1)
df <- data.frame(Key, Value)
# unique() returns the distinct rows, keeping the first occurrence of each
# combination (here: C/1, C/2, A/2 and A/1).
unique(df)

Related

Multi-row x-axis labels with breaks in R

I would like to add 2-row x-axis labels on my line plot, but not sure how to also incorporate the continuous labels and breaks I have for my 1st x-axis (Column "CYR" - short for calendar year). I'd like to have the 2nd axis (WYR) start half-way between the first label and the second (WYR = 2010 starts between CYR = 2009 -> 2010). I also wasn't sure how to add 2 x-axis titles either, maybe at the beginning of each x-axis row?
My data:
> dput(wet_pivot)
structure(list(WYR = c("WR_2010", "WR_2011", "WR_2012", "WR_2013",
"WR_2014", "WR_2015", "WR_2016", "WR_2017", "WR_2018", "WR_2019",
"WR_2020", "WR_2021", "WR_2022"), CYR = c(2009, 2010, 2011, 2012,
2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021), Season = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("DRY",
"WET"), class = "factor"), N = c(59L, 63L, 69L, 70L, 72L, 71L,
71L, 72L, 71L, 68L, 70L, 48L, 72L), n_mean = c(0.00696806934430411,
0.000649730847004026, 0.00288256551918419, 0.01141088388474,
0.000536174103147671, 0.00349584646220785, 0.000482925207291882,
0.00245359625194744, 0.00292096956686587, 0.00252817293686805,
0.00196286772014134, 0.00501799463867351, 0.00132244297252478
), n_median = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), sd = c(0.030946706350869,
0.00248965525641742, 0.0100973832581282, 0.051577934580242, 0.00331468784320076,
0.0266064084754242, 0.00212505905295283, 0.00675243933898364,
0.0119729983336735, 0.00639785127193391, 0.00930625647382774,
0.0136275258272549, 0.00543420856675111), se = c(0.00402891799826298,
0.000313667078988821, 0.00121558209746373, 0.0061647423020683,
0.000390639708573979, 0.00315759975690469, 0.000252198110662322,
0.000795782607691024, 0.00142093348159893, 0.000775853428563995,
0.00111231039833223, 0.00196696392618855, 0.000640427621321956
)), row.names = c(NA, -13L), class = "data.frame")
My attempt:
# Tick positions and tick labels for the calendar-year axis (2009 to 2021).
years <- seq(2009,2021,1)
labs <- seq(2009,2021,by=1)
# Base plot: mean with +/- one standard error bars, points and a connecting
# line, with one x-axis tick per calendar year.
myplot <- ggplot(wet_pivot, aes(x = CYR, y = n_mean)) +
geom_errorbar(aes(ymin=n_mean-se, ymax=n_mean+se), width=.2, color = "black") +
geom_point(color = "black", shape = 1, size = 2) +
geom_line(color = "black") +
scale_y_continuous(limits = c(0, 0.04), expand = expansion(mult = c(0, 0.05))) +
scale_x_continuous(breaks= years, labels = labs)
# Attempt to draw two extra rows of labels below the axis as text annotations.
# NOTE: x = 1:nrow(wet_pivot) places the text at x = 1..13, whereas the plotted
# x values are the CYR years 2009..2021, so the labels do not line up with the
# data.
myplot +
annotate(geom = "text",
x = 1:nrow(wet_pivot),
y = min(wet_pivot$n_mean),
label = labs,
vjust = 3.5) +
annotate(geom = "text",
x = 1:nrow(wet_pivot),
y = min(wet_pivot$n_mean),
label = wet_pivot$WYR,
vjust = 5)
You indeed can use text annotations to substitute for x-axis labels. A few recommendations:
Set y = -Inf to automatically place the text at the bottom of the panel, independent of whatever data is on the plot. vjust can then be used to push it further down.
You'd need coord_cartesian(clip = "off") to actually show the text.
You can place 'titles' with an extra annotation layer, with x = -Inf to place it on the left.
I used the above for the example below. Maybe the text is still too big, in which case you could set the 8.8 / .pt to something smaller. (Dividing by .pt converts a size in points, which theme() uses, into mm, which geom_text() uses.)
library(ggplot2)
# wet_pivot <- structure(...) # omitted for brevity

# Take the tick positions straight from the data so that the breaks and both
# rows of annotation text always line up with the plotted years. This also
# avoids a helper variable called `labs`, which would shadow ggplot2::labs().
years <- wet_pivot$CYR

ggplot(wet_pivot, aes(x = CYR, y = n_mean)) +
  geom_errorbar(
    aes(ymin = n_mean - se, ymax = n_mean + se),
    width = .2, color = "black"
  ) +
  geom_point(color = "black", shape = 1, size = 2) +
  geom_line(color = "black") +
  scale_y_continuous(
    limits = c(0, 0.04),
    expand = expansion(mult = c(0, 0.05))
  ) +
  # Keep the ticks but blank out the built-in labels; the annotations below
  # take their place.
  scale_x_continuous(breaks = years, labels = ~ rep("", length(.x))) +
  # First label row: calendar years. y = -Inf pins the text to the bottom
  # edge of the panel and vjust pushes it below that edge.
  annotate(
    geom = "text",
    x = years,
    y = -Inf,
    label = years,
    size = 8.8 / .pt,
    vjust = 2.5
  ) +
  # Second label row: the WYR labels, placed one row further down.
  annotate(
    geom = "text",
    x = years,
    y = -Inf,
    label = wet_pivot$WYR,
    size = 8.8 / .pt,
    vjust = 4
  ) +
  # Titles: one per label row, right-aligned against the left panel edge.
  annotate(
    geom = "text",
    x = -Inf,
    y = -Inf,
    label = c("CYR", "WYR"),
    vjust = c(2.5, 4), hjust = 1,
    size = 8.8 / .pt
  ) +
  # Text drawn outside the panel is only visible when clipping is off.
  coord_cartesian(clip = "off") +
  theme(
    # Make extra space between axis ticks and axis title
    axis.text.x.bottom = element_text(margin = margin(t = 8.8, b = 8.8))
  )
Created on 2022-05-19 by the reprex package (v2.0.1)

Subscripts when using bquote in geom_text followed by text

I've been looking through stackoverflow for the past few hours and couldn't find a solution which works for my problem.
The issue: I'm trying to use geom_text to manually write in a manually calculated value, which is fine when I just write the value and its unit using bquote (for example just writing 4.4µm is fine). However, I would like to write out D[g]= 4.4µm (with the g subscripted!) in the top left corner using geom_text. But once I start trying to add a subscript to it, it doesn't work and I get one of the following errors:
Error: Don't know how to add RHS to theme object
Error: Aesthetics must be either length 1 or the same as the data (14): label
Error: unexpected string constant in:
" #geom_text(label = bquote("D" [g] "= 4.4µm"), x = -.3, y = 900, size = 10) +
geom_text(label = bquote(D[g]"=4.4µm"
Here is the code I've been using thus far:
A = structure(list(`Size (µm)` = c(0.85, 0.85, 1.6, 1.6, 2.7, 2.7,
4, 4, 5.85, 5.85, 9, 9, 20, 20), `C/dlogd` = c(0, 70.7482842209313,
70.7482842209313, 147.721992341133, 147.721992341133, 752.133343128365,
752.133343128365, 296.076678012312, 296.076678012312, 226.648066580862,
226.648066580862, 302.286593848111, 302.286593848111, 0), `+s` = c(0,
175.47011068069, 175.47011068069, 243.15534458114, 243.15534458114,
1007.91042406178, 1007.91042406178, 439.475898343651, 439.475898343651,
366.578774657598, 366.578774657598, 385.065980566499, 385.065980566499,
0), `-s` = c(0, -33.9735422388272, -33.9735422388272, 52.2886401011273,
52.2886401011273, 496.356262194949, 496.356262194949, 152.677457680973,
152.677457680973, 86.7173585041254, 86.7173585041254, 219.507207129723,
219.507207129723, 0), Sampling = c("A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A"), Date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("21st of February 2019",
"19th of June 2019", "3rd of July 2019", "17th of July 2019"), class = "factor"),
Datenoyear = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("21st of February", "19th of June",
"3rd of July", "17th of July"), class = "factor"), DateNumber = c(21.2,
21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2,
21.2, 21.2, 21.2)), row.names = c(NA, -14L), class = c("tbl_df",
"tbl", "data.frame"))
ggplot(A, aes(x= `Size (µm)`)) +
geom_line(aes(y = `C/dlogd`), size = 1.5) +
geom_line(aes(y = `+s`), linetype = "twodash", color = "red") +
geom_line(aes(y = `-s`), linetype = "dashed" , color = "blue") +
geom_point(aes(y = `+s`), color = "red") +
geom_point(aes(y = `C/dlogd`)) +
geom_point(aes(y = `-s`), color = "blue") +
scale_x_log10(limits = c(0.4,21)) +
#geom_text(label = bquote("D" [g] "= 4.4µm"), x = -.3, y = 900, size = 10) + #gives the second error
#geom_text(label = bquote(D[g]"=4.4µm"), x = -.1, y = 900, size = 10) + #gives the 3rd error
# geom_text(label = bquote("4.4µm"), x = -0.3, y = 900, size = 10) + #No error, but doesn't give me the "D[g] = " I need
theme_bw() +
annotation_logticks(side = "b") +
facet_grid(~Datenoyear) +
xlab(element_blank()) +
ylab(element_blank())
What is it that I'm doing wrong? I've tried looking at these solutions:
http://rchaeology.blogspot.com/2012/11/combining-superscripts-subscripts-and.html
https://community.rstudio.com/t/use-bquote-in-ggplot2-for-subscript-text/40882
writing a label in R for a plot using text, and subscripts using either bquote, paste or expression
But unfortunately none of them have worked. Any help would be appreciated.
Unfortunately I can't tell you what the problem is with your code. But instead of using bquote the desired result can be achieved using as.character(expression(paste(D[g], "=4.4µm"))). Source: see here. Try this:
# Size distribution with its +s / -s envelopes, plus a plotmath label.
ggplot(A, aes(x= `Size (µm)`)) +
geom_line(aes(y = `C/dlogd`), size = 1.5) +
geom_line(aes(y = `+s`), linetype = "twodash", color = "red") +
geom_line(aes(y = `-s`), linetype = "dashed" , color = "blue") +
geom_point(aes(y = `+s`), color = "red") +
geom_point(aes(y = `C/dlogd`)) +
geom_point(aes(y = `-s`), color = "blue") +
# as.character() turns the expression into a string, and parse = TRUE makes
# geom_text() parse that string back into a plotmath expression, so D[g] is
# rendered with a subscript.
# NOTE(review): x is set outside aes(); presumably it is interpreted on the
# log10-transformed axis (10^-0.1 is about 0.8) -- confirm the placement.
geom_text(label = as.character(expression(paste(D[g], "=4.4µm"))), parse = TRUE, x = -.1, y = 900, size = 10) + # working replacement for the failing bquote() attempts
scale_x_log10(limits = c(0.4, 21)) +
theme_bw() +
annotation_logticks(side = "b") +
facet_grid(~Datenoyear) +
xlab(element_blank()) +
ylab(element_blank())

Concatenating 2 dataframes vertically with empty row in middle

I have a MultiIndex dataframe df1 as:
node A1 A2
bkt B1 B2
Month
1 0.15 -0.83
2 0.06 -0.12
df1.columns
MultiIndex([( 'A1', 'B1'),
( 'A2', 'B2')],
names=['node', 'bkt'])
and another similar multiindex dataframe df2 as:
node A1 A2
bkt B1 B2
Month
1 -0.02 -0.15
2 0 0
3 -0.01 -0.01
4 -0.06 -0.11
I want to concat them vertically so that resulting dataframe df3 looks as following:
df3 = pd.concat([df1, df2], axis=0)
While concatenating, I want to introduce 2 blank rows between dataframes df1 and df2. In addition, I want to introduce two strings, Basis Mean and Basis P25, in df3 as shown below.
print(df3)
Basis Mean
node A1 A2
bkt B1 B2
Month
1 0.15 -0.83
2 0.06 -0.12
Basis P25
node A1 A2
bkt B1 B2
Month
1 -0.02 -0.15
2 0 0
3 -0.01 -0.01
4 -0.06 -0.11
I don't know whether there is any way of doing the above.
I don't think that that is an actual concatenation you are talking about.
The following could already do the trick:
# Print each frame under its own title; to_string() renders the frame as text.
print('Basis Mean')
print(df1.to_string())
# print() appends its own line ending, so this leaves two blank lines.
print('\n')
print('Basis P25')
print(df2.to_string())
This isn't usually how DataFrames are used, but perhaps you wish to append rows of empty strings in between df1 and df2, along with rows containing your titles?
# Title and spacer rows are built as one-row frames over df1's columns so that
# they stack cleanly with the data in a single concat. (DataFrame.append was
# removed in pandas 2.0, and a three-cell title row does not fit a two-column
# frame.) df1 itself is left unchanged.
n_cols = len(df1.columns)


def _text_row(*cells):
    """Return a one-row frame of strings spanning df1's columns.

    Trailing cells that receive no text are left blank. If more words are
    given than there are columns, the words are joined into the first cell.
    """
    cells = list(cells)
    if len(cells) > n_cols:
        cells = [" ".join(cells)]
    cells += [""] * (n_cols - len(cells))
    return pd.DataFrame([cells], columns=df1.columns)


df3 = pd.concat(
    [
        _text_row("Basis", "Mean"),
        df1,
        _text_row(),  # two blank spacer rows
        _text_row(),
        _text_row("Basis", "P25"),
        df2,
    ],
    axis=0,
)
Author clarified in the comment that he wants to make it easy to print to an excel file. It can be achieved using pd.ExcelWriter.
Below is an example of how to do it.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import pandas as pd
@dataclass
class SaveTask:
    """One table to write to a sheet, together with its title.

    The ``@dataclass`` decorator is required: it generates the ``__init__``
    that lets callers write ``SaveTask(df, "title", {...})``.
    """

    # Table to write.
    df: pd.DataFrame
    # Title written in the row directly above the table (may be None).
    header: Optional[str]
    # Extra keyword arguments forwarded to DataFrame.to_excel.
    extra_pd_settings: Optional[Dict[str, Any]] = None
def fill_xlsx(
    save_tasks: List[SaveTask],
    writer: pd.ExcelWriter,
    sheet_name: str = "Sheet1",
    n_rows_between_blocks: int = 2,
) -> None:
    """Write several titled DataFrames one below another on a single sheet.

    Args:
        save_tasks: Tables to write, in top-to-bottom order.
        writer: Open Excel writer; the worksheet object must support
            ``write(row, col, value)``.
        sheet_name: Name of the sheet that receives all tables.
        n_rows_between_blocks: Number of empty rows left after each table.

    Raises:
        ValueError: If a task tries to set ``startrow`` itself; the row
            position is managed here.

    NOTE(review): the row accounting reserves a single header row per table;
    frames with MultiIndex columns presumably occupy more -- confirm before
    relying on the spacing for such frames.
    """
    current_row = 0
    for save_task in save_tasks:
        extra_pd_settings = save_task.extra_pd_settings or {}
        if "startrow" in extra_pd_settings:
            raise ValueError(
                "You should not use parameter 'startrow' in extra_pd_settings"
            )
        # The title occupies current_row, so the table starts one row lower.
        save_task.df.to_excel(
            writer,
            sheet_name=sheet_name,
            startrow=current_row + 1,
            **extra_pd_settings
        )
        # The sheet is looked up only after to_excel has written to it.
        worksheet = writer.sheets[sheet_name]
        worksheet.write(current_row, 0, save_task.header)
        # `header` may be a bool or a list of column aliases; either way a
        # truthy value means one header row was written.
        has_header = bool(extra_pd_settings.get("header", True))
        current_row += (
            1 + save_task.df.shape[0] + n_rows_between_blocks + int(has_header)
        )
if __name__ == "__main__":
    # INPUTS
    df1 = pd.DataFrame(
        {"hello": [1, 2, 3, 4], "world": [0.55, 1.12313, 23.12, 0.0]}
    )
    df2 = pd.DataFrame(
        {"foo": [3, 4]},
        index=pd.MultiIndex.from_tuples([("foo", "bar"), ("baz", "qux")]),
    )
    # Xlsx creation
    # The context manager saves and closes the workbook on exit, even if
    # writing fails; ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter("test.xlsx", engine="xlsxwriter") as writer:
        fill_xlsx(
            [
                SaveTask(
                    df1,
                    "Hello World Table",
                    {"index": False, "float_format": "%.3f"},
                ),
                SaveTask(df2, "Foo Table with MultiIndex"),
            ],
            writer,
        )
As an extra bonus, pd.ExcelWriter lets you save data to different sheets in the Excel file and choose their names right from Python code.

Aggregate/Remove duplicate rows in DataFrame based on swapped index levels

Sample input
import pandas as pd
df = pd.DataFrame([
['A', 'B', 1, 5],
['B', 'C', 2, 2],
['B', 'A', 1, 1],
['C', 'B', 1, 3]],
columns=['from', 'to', 'type', 'value'])
df = df.set_index(['from', 'to', 'type'])
Which looks like this:
value
from to type
A B 1 5
B C 2 2
A 1 1
C B 1 3
Goal
I now want to remove "duplicate" rows from this in the following sense: for each row with an arbitrary index (from, to, type), if there exists a row (to, from, type), the value of the second row should be added to the first row and the second row be dropped. In the example above, the row (B, A, 1) with value 1 should be added to the first row and dropped, leading to the following desired result.
Sample result
value
from to type
A B 1 6
B C 2 2
C B 1 3
This is my best try so far. It feels unnecessarily verbose and clunky:
# Add to every (from, to, type) row the value of its mirrored
# (to, from, type) row, when such a row exists.
df2 = df.reset_index()
df3 = df2.rename(columns={'from': 'to', 'to': 'from'})
mirrored = df3.set_index(['from', 'to', 'type'])
df_both = df.join(mirrored, rsuffix='_b').sum(axis=1)

# Of each mirrored pair, keep the row met first and schedule its mirror
# for removal.
rows_to_keep = []
rows_to_remove = []
for key in df_both.index:
    a, b, t = key
    mirror = (b, a, t)
    if mirror not in df_both.index:
        continue
    if mirror in rows_to_keep:
        continue
    rows_to_keep.append(key)
    rows_to_remove.append(mirror)

df_final = df_both.drop(rows_to_remove)
df_final
Especially the second "de-duplication" step feels very unpythonic. (How) can I improve these steps?
Not sure how much better this is, but it's certainly different
import pandas as pd
from collections import Counter

df = pd.DataFrame([
    ['A', 'B', 1, 5],
    ['B', 'C', 2, 2],
    ['B', 'A', 1, 1],
    ['C', 'B', 1, 3]],
    columns=['from', 'to', 'type', 'value'])
df = df.set_index(['from', 'to', 'type'])

# Sum `value` per undirected (from, to) pair and type. Only the two endpoint
# labels are sorted, so (A, B, t) and (B, A, t) share one key while the type
# stays in last position and is never compared with the string labels.
counted = Counter()
for (src, dst, kind), value in df['value'].items():
    first, second = sorted((src, dst))
    counted[(first, second, kind)] += int(value)

How to create multiple scatterplots & color them differently based on Day-type(as.factors)

I used sqldf function to join two tables & create a final TABLE to make the scatterplot. My final table has 6 variables as follows:-
'data.frame': 11520 obs. of 6 variables:
$ DATE : chr "01/01/2013" "01/01/2013" "01/01/2013" "01/01/2013" ...
$ HOUR_NUM : int 1 2 3 4 5 6 7 8 9 10 ...
$ CONGESTION_ZONE_CD: Factor w/ 4 levels "H","N","S","W": 1 1 1 1 1 1 1 1 1 1 ...
$ DAY_TYPE_CD : Factor w/ 2 levels "WD","WE": 1 1 1 1 1 1 1 1 1 1 ...
$ LOAD : num 182 171 158 147 141 ...
$ AVG_TEMP : num 66.3 65.9 66.3 65 62.9 61.4 58.3 56.7 55.5 54.3 ...
i used the simple plot function to get the scatterplot.
# Pull the two plotting vectors out of the joined table.
TOTAL_LOAD <- WE_TABLE$LOAD
TEMP <- WE_TABLE$AVG_TEMP
# Scatterplot of load against average temperature.
plot(TEMP, TOTAL_LOAD, col = "blue")
Works perfectly! Now, I am using sqldf again to make a subset where CONGESTION_ZONE_CD='H' and rewriting that code again for a different table (H). Same thing for North, South and West too. So I am basically creating 4 subset tables and rewriting the same code. My first question is:
1) Can i just output the scatterplots for all four different types of zone by some functions at once? ( i read a lot about 'by' functions but couldn't wrap my head around it to perform this).
Now, for the second part, as i make scatterplot for each zone, i know i have both Weekdays(WD) and weekend(WE) in there. but,
2.) Is there a way i could color code my scatterplot based on WD & WE (DAY_TYPE_CD as factors) (let's say blue for WD & red for WE)?
Really like to thank you guys in advance! Since, i am still a beginner in R & i have checked questions on scatterplots before i post it here. i have a feeling this might have a simple solution that i am not aware of.
Thanks again.
dput(Table)
structure(list(DATE = c("01/01/2013", "01/01/2013", "01/01/2013",
"01/01/2013", "01/01/2013", "01/01/2013", "01/01/2013", "01/01/2013",
"01/01/2013", "01/01/2013"), HOUR_NUM = 1:10, CONGESTION_ZONE_CD = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("H", "N", "S",
"W"), class = "factor"), DAY_TYPE_CD = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("WD", "WE"), class = "factor"),
LOAD = c(181.801152, 170.512058, 157.95493, 147.299889, 140.645532,
139.216646, 141.670543, 149.122035, 160.893591, 181.996018
), AVG_TEMP = c(66.3, 65.9, 66.3, 65, 62.9, 61.4, 58.3, 56.7,
55.5, 54.3)), .Names = c("DATE", "HOUR_NUM", "CONGESTION_ZONE_CD",
"DAY_TYPE_CD", "LOAD", "AVG_TEMP"), row.names = c(NA, 10L), class = "data.frame")
Much better to use ggplot for this:
# not tested...
library(ggplot2)
# `df` is the joined table; the column names used here are the ones listed
# in the question's str() output (LOAD, AVG_TEMP, CONGESTION_ZONE_CD,
# DAY_TYPE_CD).

# all on one plot, one color per congestion zone
ggplot(df) +
  geom_point(aes(x = AVG_TEMP, y = LOAD, color = CONGESTION_ZONE_CD))

# four plots, one panel per congestion zone
ggplot(df) +
  geom_point(aes(x = AVG_TEMP, y = LOAD)) +
  facet_wrap(~CONGESTION_ZONE_CD)

# coloring based on day type: blue for weekdays (WD), red for weekends (WE)
ggplot(df) +
  geom_point(aes(x = AVG_TEMP, y = LOAD, color = DAY_TYPE_CD)) +
  scale_color_manual(values = c(WD = "blue", WE = "red")) +
  facet_wrap(~CONGESTION_ZONE_CD)