Wilcox.test confidence intervals by group (to plot in ggplot) - ggplot2

Is there way to carry out a wilcoxon.test by group, with calculate confidence intervals, and then plot these results in ggplot?
My "data":
zero <- sample(0:0, 50, replace = TRUE)
small <- sample(1:5, 20, replace = TRUE)
medium <- sample(5:25, 15, replace = TRUE)
high <- sample(150:300, 5, replace = TRUE)
f <- function(x){
return(data.frame(ID=deparse(substitute(x)), value=x))
}
all <- bind_rows(f(zero), f(small), f(medium), f(high))
all <- as.data.frame(all[,-1])
names(all)[1] <- "value"
all$group <- c("a", "b", "c")
My attempt:
x <- ddply(all, .(group), function(x) {wilcox.test(all$value, conf.int=TRUE, conf.level=0.95)})
Error in list_to_dataframe(res, attr(.data, "split_labels"), .id, id_as_factor) :
Results must be all atomic, or all data frames
In addition: There were 12 warnings (use warnings() to see them)
I'd then like to plot the psuedo-medians with their respective confidence intervals, but I'm also not sure how to save the results for ggplot to work from.

Related

I'm having making an animated line chart, problems with the X axis

I'm trying to animate a plot I have where the X axis is non-numeric. The plot itself looks great, but I get a few error messages trying to animate it using the transition_reveal function.
I've got a data set called df100m that tracks the times/speeds of 10 meter splits of the 100 meter dash for various Olympic runners. It looks like this.
splits
runners
times(s)
speed(mph)
10-20
Bolt_08
1.070
21.93
20-30
Bolt_08
0.910
24.58
84 more rows of different splits and runners omitted for space.
Plotting the average speed for this data set using stat_smooth looks great. I removed the reaction time (RT), the final time (TOTAL), and the starting 10m (Start-10), so that it only shows the numeric splits. Here is the code for the plot I have so far:
df100m %>%
filter(!grepl("RT", splits)) %>%
filter(!grepl("TOTAL", splits)) %>%
filter(!grepl("Start-10", splits)) %>%
ggplot(mapping = aes(x = splits, y = speed, col = runner, group = runner)) +
stat_smooth(method = loess, se = F, fullrange = F) +
theme(axis.text.x = element_text(angle = 90)) +
theme(aspect.ratio = 3/7) +
theme_solarized_2(light=F)
However when I add +transition_reveal(~splits) I get the following error message:
Error in seq.default(range[1], range[2], length.out = nframes) :
'from' must be a finite number
In addition: Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
Playing around with it, I sometimes also get the "invalid 'times' argument" error.
I know there are a few problems with the X axis (splits), it's a character rather than numeric, but also has a dash (-). I've seen a few posts attempting to fix this error, but I am unable to fix it as I am a beginner. Could someone point me to the right direction?
Using some minimal made-up data, this is one possible approach creating the smoothed lines before the plotting, then basing the transition_reveal on the splits mutated to integers (as splits_int).
library(tidyverse)
library(gganimate)
library(broom)
tribble(~splits, ~speed, ~runner,
"10-20", 20.0, "A",
"20-30", 21.0, "A",
"30-40", 22.0, "A",
"10-20", 19.0, "B",
"20-30", 20.0, "B",
"30-40", 21.0, "B"
) %>%
mutate(splits_int = factor(splits) %>% as.integer()) %>%
nest(data = -runner) %>%
mutate(
lm_model = map(data, ~loess(speed ~ splits_int, data = .x)),
augmented = map(lm_model, augment) %>% map(select, .fitted)
) %>%
unnest(c(augmented, data)) %>%
ggplot(aes(splits, .fitted, col = runner, group = runner)) +
geom_line() +
transition_reveal(splits_int)
Created on 2022-12-10 with reprex v2.0.2

How to delete NA from the graph

I'm still learning R and I'm not sure why there is NA data in my graph. Considering that I have used the table function to check the variables in the column.
graph
Any suggestions to remove the NA variable in my graph?
Please find below sample of code(not actual dataset):
*Install and load relevant packages
install.packages("tidyverse")
install.packages("lubridate")
install.packages("ggplot2")
install.packages("tibble")
library(tidyverse)
library(lubridate)
library(ggplot2)
library(tibble)
library(dplyr)
*data frame
all_trips <- tribble(~start, ~end, ~start_name, ~type,
"2020-03-22 03:20:20", "2020-03-22 04:10:15", "A", "member",
"2020-03-25 01:01:07", "2020-03-25 05:09:45", NA, "member",
"2020-03-26 07:09:55", "2020-03-26 08:10:20", "B", "casual",
"2020-03-29 09:10:30", "2020-03-29 09:00:20", "A", "casual",
"2020-03-30 11:09:18", "2020-03-30 03:40:10", "B", "member")
*generate new columns
all_trips$date <- as.Date(all_trips$start) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
all_trips$ride_length <- difftime(all_trips$end,all_trips$start)
is.factor(all_trips$ride_length)
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
*data cleaning
all_trips_v2 <- all_trips[!(all_trips$start_name == "NA" |
all_trips$ride_length<0),]
*data viz
all_trips_v2 %>%
mutate(weekday = wday(start, label = TRUE)) %>% #creates weekday field using wday()
group_by(type, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(type, weekday) %>% # sorts
ggplot(aes(x = weekday, y = number_of_rides, fill = type)) +
geom_col(position = "dodge", na.rm = TRUE) +
scale_x_discrete(na.translate = FALSE)
Bar Chart:
Click here
Adding na.rmand na.translate arguments will remove missing values from bar chart without a warning message as shown here:
tibble(x = rep(c('One', 'Two', 'Two', NA),2), Group=rep(c("A","B"),each=4)) %>%
ggplot(aes(x, fill=Group)) +
labs(title="Sample Group Bar Chart with NA's Removed") +
geom_bar(stat="Count", position=position_dodge(), na.rm = TRUE) +
scale_x_discrete(na.translate = FALSE)

ggplot2: add title changes point colors <-> scale_color_manual removes ggtitle

I am facing a silly point color in a dot plot with ggplot 2. I have a whole table of data of which i take relevant rows to make a dot plot. With scale_color_manual my points get colored according to the named palette and factor genotype specified in aes() and when i simply want to add a title specifying the cell line used, the points get colored back to automatic yellow and purple. Adding the title first and setting scale_color_manual as the last layer changes the points colors and removes the title.
What is wrong in there? I don't get it and it is a bit frustrating
thanks for your help!
Here's reproducible code to get my whole df and the subset for the plots:
# df of data to plot
exp <- c(rep(284, times = 6), rep(285, times = 12))
geno <- c(rep(rep(c("WT", "KO"), each =3), times = 6))
line <- c(rep(5, times = 6),rep(8, times= 12), rep(5, times =12), rep(8, times = 6))
ttt <- c(rep(c(0, 10, 60), times = 10), rep(c("ZAc60", "Cu60", "Cu200"), times = 2))
rep <- c(rep(1, times = 12), rep(2, times = 6), rep(c(1,2), times = 6), rep(1, times = 6))
rel_expr <- c(0.20688185, 0.21576131, 0.94046028, 0.30327675, 0.22865200,
0.92941881, 0.13787508, 0.13325281, 0.22114990, 0.95591724,
1.03239718, 0.83339248, 0.15332420, 0.17558160, 0.22475604,
1.02356351, 0.77882000, 0.69214403, 0.16874097, 0.15548158,
0.45207943, 0.28123760, 0.23500083, 0.51588856, 0.1399634,
0.14610184, 1.06716713, 0.16517801, 0.34736164, 0.64773650,
0.18334429, 0.05924757, 0.01803593, 0.86685230, 0.39554685,
0.25764805)
df_all <- data.frame(exp, geno, line, ttt, rep, rel_expr)
names(df_all) <- c("EXP", "Geno", "Line", "TTT", "Rep", "Rel_Expr")
str(df_all)
# make Geno an ordered factor
df_all$Geno <- ordered(df_all$Geno, levels = c("WT", "KO"))
# select set of whole dataset for current plot
df_ions <- df_all[df_all$Line == 8 & !df_all$TTT %in% c(10, 60),]
# add a treatment as factor columns fTTT
df_ions$fTTT <- ordered(df_ions$TTT, levels = c("0", "ZAc60", "Cu60", "Cu200"))
str(df_ions)
# plot rel_exp vs factor treatment, color points by geno
# with named color palette
library(ggplot2)
col_palette <- c("#000000", "#1356BC")
names(col_palette) <- c("WT", "KO")
plt <- ggplot(df_ions, aes(x = fTTT, y = Rel_Expr, color = Geno)) +
geom_jitter(width = 0.1)
plt # intermediate_plt_1.png
plt + scale_color_manual(values = col_palette) # intermediate_plt_2.png
plt + ggtitle("mRPTEC8") # final_plot.png
images:

How to create a R shiny app for getting PCA plot

I am just starting to learn R shiny and am trying to create a shiny app that produces scatter plot for principal component analysis and allows user to choose various principal components on the X and Y axis of scatter plot. I know how to write R code to do PCA but I just cant seem to get the shiny app to get me what I need.. I have tried following the examples available for Iris kmeans clustering but I am having trouble getting the scatter plot. Here is my code so far (P.S. my original dataset has genes as rows and samples as columns (columns 1 through 10 are cancer samples, 11 through 20 are normal):
data<-read.table("genes_data.txt", header=TRUE, row.names=1)
pca_data<-prcomp(t(data), scale=T)
summary(pca_data)
pca_sig.var<-pca_data$sdev^2
pca_sig.var.per<-round(pca_sig.var/sum(pca_sig.var)*100, 1)
pca_sig.data<-data.frame(Sample=rownames(pca_data$x), PC1=pca_data$x[,1], PC2=pca_data$x[,2], PC3=pca_data$x[,3], PC4=pca_data$x[,4], PC5=pca_data$x[,5])
pca_sig.data<-pca_sig.data[-1]
pca_sig.data2<-pca_sig.data
pca_sig.data2$category=rep("CANCER", 20)
pca_sig.data2$category[11:20]=rep("NORMAL", 10)
View(pca_sig.data2)
ggplot(data=pca_sig.data2, aes(x=PC1, y=PC2, label=category, colour=category))+
geom_point(size=2, stroke=1, alpha=0.8, aes(color=category))+
xlab(paste("PCA1 - ", pca_sig.var.per[1], "%", sep=""))+
ylab(paste("PCA2 - ", pca_sig.var.per[2], "%", sep=""))+
theme_bw()+
ggtitle("My PCA Graph")
ui<-pageWithSidebar(
headerPanel('Gene Data PCA'),
sidebarPanel(
selectInput('xcol', 'X Variable', names(pca_sig.data2[,1:5])),
selectInput('ycol', 'Y Variable', names(pca_sig.data2[,1:5]),
selected=names(pca_sig.data2)[[2]])
),
mainPanel(
plotOutput('plot1')
)
)
server<- function(input, output, session) {
# Combine the selected variables into a new data frame
selectedData <- reactive({
pca_sig.data2[, c(input$xcol, input$ycol)]
})
output$plot1 <- renderPlot({
palette(c("#E41A1C", "#377EB8"))
par(mar = c(5.1, 4.1, 0, 1))
plot(selectedData(),
col=selectedData()$category,
pch = 20, cex = 3)
points(selectedData()[,1:5], pch = 4, cex = 4, lwd = 4)
})
}
shinyApp(ui = ui, server = server)
At the end, when I run the app, I get "Error:undefined columns selected"
Also, for simplicity sake let's assume that my original dataset that I want to do PCA on looks something like this (in reality I have about 600 genes and 20 samples):
probeID<-c("gene1", "gene2", "gene3", "gene4","gene5")
BCR1<-c(28.005966, 30.806433, 17.341375, 17.40666, 30.039436)
BCR2<-c(30.973469, 29.236025, 30.41161, 20.914383, 20.904331)
BCR3<-c(26.322796, 25.542833, 22.460772, 19.972183, 30.409641)
BCR4<-c(26.441898, 25.837685, 23.158352, 20.379173, 33.81327)
BCR5<-c(39.750206, 19.901133, 28.180124, 22.668673, 25.748884)
CTL6<-c(23.004385, 28.472675, 23.81621, 26.433413, 28.851719)
CTL7<-c(22.239546, 28.741674, 23.754929, 26.015385, 28.16368)
CTL8<-c(29.590443, 30.041988, 21.323061, 24.272501, 18.099016)
CTL9<-c(15.856442, 22.64224, 29.629637, 25.374926, 22.356894)
CTL10<-c(38.137985, 24.753338, 26.986668, 24.578161, 19.223558)
data<-data.frame(probeID, BCR1, BCR2, BCR3, BCR4, BCR5, CTL6, CTL7, CTL8, CTL9, CTL10)
where BCR1 through BCR5 are the cancer samples and CTL6 through CTL10 are the normal samples.
Is this what you want?
server<- function(input, output, session) {
# Combine the selected variables into a new data frame
selectedData <- reactive({
pca_sig.data2[c(input$xcol, input$ycol, 'category')]
})
output$plot1 <- renderPlot({
palette(c("#E41A1C", "#377EB8"))
plot(selectedData()[,c(1:2)], col=factor(selectedData()$category), pch = 20, cex = 3)
points(selectedData()[,c(1:2)], pch = 4, cex = 4, lwd = 4)
})
}
The result is like this:

Depth Profiling visualization

I'm trying to create a depth profile graph with the variables depth, distance and temperature. The data collected is from 9 different points with known distances between them (distance 5m apart, 9 stations, 9 different sets of data). The temperature readings are according to these 9 stations where a sonde was dropped directly down, taking readings of temperature every 2 seconds. Max depth at each of the 9 stations were taken from the boat also.
So the data I have is:
Depth at each of the 9 stations (y axis)
Temperature readings at each of the 9 stations, at around .2m intervals vertical until the bottom was reached (fill area)
distance between the stations, (x axis)
Is it possible to create a depth profile similar to this? (obviously without the greater resolution in this graph)
I've already tried messing around with ggplot2 and raster but I just can't seem to figure out how to do this.
One of the problems I've come across is how to make ggplot2 distinguish between say 5m depth temperature reading at station 1 and 5m temperature reading at station 5 since they have the same depth value.
Even if you can guide me towards another program that would allow me to create a graph like this, that would be great
[ REVISION ]
(Please comment me if you know more suitable interpolation methods, especially not needing to cut under bottoms data.)
ggplot() needs long data form.
library(ggplot2)
# example data
max.depths <- c(1.1, 4, 4.7, 7.7, 8.2, 7.8, 10.7, 12.1, 14.3)
depth.list <- sapply(max.depths, function(x) seq(0, x, 0.2))
temp.list <- list()
set.seed(1); for(i in 1:9) temp.list[[i]] <- sapply(depth.list[[i]], function(x) rnorm(1, 20 - x*0.5, 0.2))
set.seed(1); dist <- c(0, sapply(seq(5, 40, 5), function(x) rnorm(1, x, 1)))
dist.list <- sapply(1:9, function(x) rep(dist[x], length(depth.list[[x]])))
main.df <- data.frame(dist = unlist(dist.list), depth = unlist(depth.list) * -1, temp = unlist(temp.list))
# a raw graph
ggplot(main.df, aes(x = dist, y = depth, z = temp)) +
geom_point(aes(colour = temp), size = 1) +
scale_colour_gradientn(colours = topo.colors(10))
# a relatively raw graph (don't run with this example data)
ggplot(main.df, aes(x = dist, y = depth, z = temp)) +
geom_raster(aes(fill = temp)) + # geom_contour() +
scale_fill_gradientn(colours = topo.colors(10))
If you want a graph such like you showed, you have to do interpolation. Some packages give you spatial interpolation methods. In this example, I used akima package but you should think seriously that which interpolation methods to use.
I used nx = 300 and ny = 300 in below code but I think it would be better to decide those values carefully. Large nx and ny gives a high resolution graph, but don't foreget real nx and ny (in this example, real nx is only 9 and ny is 101).
library(akima); library(dplyr)
interp.data <- interp(main.df$dist, main.df$depth, main.df$temp, nx = 300, ny = 300)
interp.df <- interp.data %>% interp2xyz() %>% as.data.frame()
names(interp.df) <- c("dist", "depth", "temp")
# draw interp.df
ggplot(interp.df, aes(x = dist, y = depth, z = temp)) +
geom_raster(aes(fill = temp)) + # geom_contour() +
scale_fill_gradientn(colours = topo.colors(10))
# to think appropriateness of interpolation (raw and interpolation data)
ggplot(interp.df, aes(x = dist, y = depth, z = temp)) +
geom_raster(aes(fill = temp), alpha = 0.3) + # interpolation
scale_fill_gradientn(colours = topo.colors(10)) +
geom_point(data = main.df, aes(colour = temp), size = 1) + # raw
scale_colour_gradientn(colours = topo.colors(10))
Bottoms don't match !!I found ?interp says "interpolation only within convex hull!", oops... I'm worrid about the interpolation around the problem-area, is it OK ? If no problem, you need only cut the data under the bottoms. If not, ... I can't answer immediately (below is an example code to cut).
bottoms <- max.depths * -1
# calculate bottom values using linear interpolation
approx.bottoms <- approx(dist, bottoms, n = 300) # n must be the same value as interp()'s nx
# change temp values under bottom into NA
library(dplyr)
interp.cut.df <- interp.df %>% cbind(bottoms = approx.bottoms$y) %>%
mutate(temp = ifelse(depth >= bottoms, temp, NA)) %>% select(-bottoms)
ggplot(interp.cut.df, aes(x = dist, y = depth, z = temp)) +
geom_raster(aes(fill = temp)) +
scale_fill_gradientn(colours = topo.colors(10)) +
geom_point(data = main.df, size = 1)
If you want to use stat_contour
It is harder to use stat_contour than geom_raster because it needs a regular grid form. As far as I see your graph, your data (depth and distance) don't form a regular grid, it means it is much difficult to use stat_contour with your raw data. So I used interp.cut.df to draw a contour plot. And stat_contour have a endemic problem (see How to fill in the contour fully using stat_contour), so you need to expand your data.
library(dplyr)
# 1st: change NA into a temp's out range value (I used 0)
interp.contour.df <- interp.cut.df
interp.contour.df[is.na(interp.contour.df)] <- 0
# 2nd: expand the df (It's a little complex, so please use this function)
contour.support.func <- function(df) {
colname <- names(df)
names(df) <- c("x", "y", "z")
Range <- as.data.frame(sapply(df, range))
Dim <- as.data.frame(t(sapply(df, function(x) length(unique(x)))))
arb_z = Range$z[1] - diff(Range$z)/20
df2 <- rbind(df,
expand.grid(x = c(Range$x[1] - diff(Range$x)/20, Range$x[2] + diff(Range$x)/20),
y = seq(Range$y[1], Range$y[2], length = Dim$y), z = arb_z),
expand.grid(x = seq(Range$x[1], Range$x[2], length = Dim$x),
y = c(Range$y[1] - diff(Range$y)/20, Range$y[2] + diff(Range$y)/20), z = arb_z))
names(df2) <- colname
return(df2)
}
interp.contour.df2 <- contour.support.func(interp.contour.df)
# 3rd: check the temp range (these values are used to define contour's border (breaks))
range(interp.cut.df$temp, na.rm=T) # 12.51622 20.18904
# 4th: draw ... the bottom border is dirty !!
ggplot(interp.contour.df2, aes(x = dist, y = depth, z = temp)) +
stat_contour(geom="polygon", breaks = seq(12.51622, 20.18904, length = 11), aes(fill = ..level..)) +
coord_cartesian(xlim = range(dist), ylim = range(bottoms), expand = F) + # cut expanded area
scale_fill_gradientn(colours = topo.colors(10)) # breaks's length is 11, so 10 colors are needed
# [Note]
# You can define the contour's border values (breaks) and colors.
contour.breaks <- c(12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5)
# = seq(12.5, 20.5, 1) or seq(12.5, 20.5, length = 9)
contour.colors <- c("darkblue", "cyan3", "cyan1", "green3", "green", "yellow2","pink", "darkred")
# breaks's length is 9, so 8 colors are needed.
# 5th: vanish the bottom border by bottom line
approx.df <- data.frame(dist = approx.bottoms$x, depth = approx.bottoms$y, temp = 0) # 0 is dummy value
ggplot(interp.contour.df2, aes(x = dist, y = depth, z = temp)) +
stat_contour(geom="polygon", breaks = contour.breaks, aes(fill = ..level..)) +
coord_cartesian(xlim=range(dist), ylim=range(bottoms), expand = F) +
scale_fill_gradientn(colours = contour.colors) +
geom_line(data = approx.df, lwd=1.5, color="gray50")
bonus: legend technic
library(dplyr)
interp.contour.df3 <- interp.contour.df2 %>% mutate(temp2 = cut(temp, breaks = contour.breaks))
interp.contour.df3$temp2 <- factor(interp.contour.df3$temp2, levels = rev(levels(interp.contour.df3$temp2)))
ggplot(interp.contour.df3, aes(x = dist, y = depth, z = temp)) +
stat_contour(geom="polygon", breaks = contour.breaks, aes(fill = ..level..)) +
coord_cartesian(xlim=range(dist), ylim=range(bottoms), expand = F) +
scale_fill_gradientn(colours = contour.colors, guide = F) + # add guide = F
geom_line(data = approx.df, lwd=1.5, color="gray50") +
geom_point(aes(colour = temp2), pch = 15, alpha = 0) + # add
guides(colour = guide_legend(override.aes = list(colour = rev(contour.colors), alpha = 1, cex = 5))) + # add
labs(colour = "temp") # add
You want to treat this as a 3-D surface with temperature as the z dimension. The given plot is a contour plot and it looks like ggplot2 can do that with stat_contour.
I'm not sure how the contour lines are computed (often it's linear interpolation along a Delaunay triangulation). If you want more control over how to interpolate between your x/y grid points, you can calculate a surface model first and feed those z coordinates into ggplot2.