dplyr distinct column values sql

I use dplyr in conjunction with a PostgreSQL database, which makes a reproducible example a bit hard for me. Anyway, I want to use the distinct function to filter out messy data, i.e. duplicate timestamps. So far I have:
db <- src_postgres(dbname = "a", host = "b", port = 1234,
                   user = "c")
measurements <- tbl(db, "measurement")
sites <- group_by(measurements, site)
sites_clean <- filter(sites, site < 38)
sites_clean <- distinct(sites_clean, timestamp)
P_stats <- summarise(
  sites_clean,
  count = n(),
  P = mean(p_sum)
)
collect(P_stats)
I get the error:
Error: Can't calculate distinct only on specified columns with SQL
Is there a workaround for this? Will dplyr support this in a future version?
Update
I followed the documentation and created a minimal working example using a SQLite database (also thanks to beginneR for the %>% reminder).
library(dplyr)

set.seed(1)
my_db <- src_sqlite("my_db.sqlite3", create = TRUE)
meas <- data.frame(id = 1:30,
                   timestamp = sample(Sys.time() + c(1, 2, 3) * 3600,
                                      size = 30, replace = TRUE),
                   site = sample(c(1, 2, 40), size = 30, replace = TRUE),
                   p_sum = rpois(30, 2))
meas_sqlite <- copy_to(my_db, meas, temporary = FALSE)
meas_tbl <- tbl(my_db, "meas")
P_stats <- group_by(meas_tbl, site, timestamp) %>%
  summarise(P = mean(p_sum)) %>%
  group_by(site) %>%
  filter(site < 38) %>%
  summarise(count = n(), P = mean(P))
collect(P_stats)
While this works, I feel it is not as clean as it could be. I also still need to try the suggestion with row_number(timestamp) == 1 on the PostgreSQL instance.
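For reference, a minimal sketch of that row_number() suggestion, assuming the PostgreSQL backend translates it into a ROW_NUMBER() window function (untested here; older SQLite backends lack window functions):
sites_clean <- measurements %>%
  filter(site < 38) %>%
  group_by(site, timestamp) %>%
  # keep one row per (site, timestamp); dplyr should translate this filter
  # into ROW_NUMBER() OVER (PARTITION BY site, timestamp ORDER BY timestamp) = 1
  filter(row_number(timestamp) == 1) %>%
  group_by(site)
P_stats <- summarise(sites_clean, count = n(), P = mean(p_sum))
collect(P_stats)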

Related

ggplot2: combining line and barplot in one graph

Let's say I'm creating a grouped barplot with something like this:
data <- data.frame(time = factor(1:3), type = LETTERS[1:4], values = runif(24) * 10)
ggplot(data, aes(x = type, y = values, fill = time)) +
  stat_summary(fun = mean, geom = 'bar', width = 0.55, size = 1, position = position_dodge(0.75))
Within each type I want to connect all the bar tops with a line (meaning connect the 3 bars for A, the 3 bars for B, and so on). I'd like to get something like that as a result. Is there a way to do that? Thank you!
I changed the code to use a different logic that I prefer: prepare the data before calling ggplot().
Code
library(dplyr)
library(ggplot2)

data <- data.frame(time = factor(1:3), type = LETTERS[1:4], values = runif(24) * 10)
pdata <- data %>%
  group_by(type, time) %>%
  summarise(values = mean(values, na.rm = TRUE)) %>%
  ungroup()
pdata %>%
  ggplot(aes(x = type, y = values)) +
  geom_col(
    mapping = aes(fill = time, group = time),
    width = 0.55,
    size = 1,
    position = position_dodge(0.75)
  ) +
  geom_line(
    mapping = aes(group = type),
    size = 1,
    position = position_dodge2(.75)
  )
Output

For loop to read in multiple tables from SQLite database

I would like to create a for loop that reads in multiple tables from a SQLite database. It could read the first 300 tables, but ideally it would read 300 random tables from my database into R.
For each table read in, I would like it to run the code below, save the resulting graph, and then start over with a new table. If possible, I would like all of the tables to end up on the same graph. I have written the code for a single table, but I am unsure how to proceed from here.
for (i in 1:300) {
  # Read the selected table from the database
  ind1 <- dbReadTable(mydb, i)
  # Format the SQL data into the appropriate R data structures
  cols <- c("Mortality", "AnimalID", "Species", "Sex", "CurrentCohort",
            "BirthYear", "CaptureUnit", "CaptureSubunit",
            "CaptureArea", "ProjectName")
  ind1[cols] <- lapply(ind1[cols], factor)  # as.factor() could also be used
  ind1$DateAndTime <- as.POSIXct(ind1$DateAndTime, tz = "UTC",
                                 origin = '1970-01-01')
  # Convert the longitude and latitude to UTMs
  ind <- convert_utm(ind1)
  ind_steps <- ind %>%
    # It's always a good idea to *double check* that your data are sorted
    # properly before using lag() or lead() to get the previous/next value.
    arrange(AnimalID, DateAndTime) %>%
    # If we group_by() AnimalID, lead() will insert NAs in the proper
    # places when we get to the end of one individual's data and the
    # beginning of the next
    group_by(AnimalID) %>%
    # Now rename our base columns to reflect that they are the step's start point
    rename(x1 = utm_x,
           y1 = utm_y,
           t1 = DateAndTime) %>%
    # Attach the step's end point
    mutate(x2 = lead(x1),
           y2 = lead(y1),
           t2 = lead(t1)) %>%
    # Calculate differences in space and time
    mutate(dx = x2 - x1,
           dy = y2 - y1,
           DateAndTime = as.numeric(difftime(t2, t1, units = "hours"))) %>%
    # Calculate step length
    mutate(sl = sqrt(dx^2 + dy^2)) %>%
    # Calculate absolute angle
    mutate(abs_angle = (pi/2 - atan2(dy, dx)) %% (2*pi)) %>%
    # Calculate relative angle
    mutate(rel_diff = (abs_angle - lag(abs_angle)) %% (2*pi),
           rel_angle = ifelse(rel_diff > pi, rel_diff - 2*pi, rel_diff)) %>%
    # Drop this unnecessary column
    select(-rel_diff) %>%
    # Drop the incomplete final step
    filter(!is.na(x2))
  ind_steps <- ind_steps %>%
    mutate(NSD = (x2 - x1[1])^2 + (y2 - y1[1])^2)
  # Plot NSD
  ind_steps %>%
    ggplot(aes(x = t2, y = NSD)) +
    geom_line() +
    theme_bw()
}
Any help would be greatly appreciated!
If there are, say, 1000 tables, you can use sample() to pick 300 of them at random, create a list of length 300 to store the plots, and if you want to plot them together you can use cowplot::plot_grid().
# dbReadTable() expects a table name, so sample 300 names without replacement
tables <- dbListTables(mydb)
random_tables <- sample(tables, 300)
plot_list <- vector('list', 300)
for (i in seq_along(random_tables)) {
  # Read the selected table from the database
  ind1 <- dbReadTable(mydb, random_tables[i])
  # ...rest of the code
  # ....
  # ....
  # Plot NSD
  plot_list[[i]] <- ggplot(ind_steps, aes(x = t2, y = NSD)) +
    geom_line() + theme_bw()
}
cowplot::plot_grid(plotlist = plot_list, nrow = 30, ncol = 10)
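If you would rather have all of the tables on one graph instead of a 300-panel grid, here is one sketch, assuming you also store each table's ind_steps in a list (a hypothetical steps_list, filled inside the same loop):
steps_list <- vector('list', 300)
# inside the loop, after computing ind_steps: steps_list[[i]] <- ind_steps
# then bind everything together, keeping the list index as an id column
all_steps <- dplyr::bind_rows(steps_list, .id = "table_id")
ggplot(all_steps, aes(x = t2, y = NSD, group = table_id)) +
  geom_line(alpha = 0.3) +
  theme_bw()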

Adding percentage labels to a barplot with y-axis 'count' in R

I'd like to add percentage labels per gear to the bars but keep the count y-scale.
E.g. 10% of all 'gear 3' are '4 cyl'
library(ggplot2)

ds <- mtcars
ds$gear <- as.factor(ds$gear)
p1 <- ggplot(ds, aes(gear, fill = gear)) +
  geom_bar() +
  facet_grid(cols = vars(cyl), margins = T)
p1
Ideally only in ggplot, without adding dplyr or tidyr. I found some solutions along these lines, but then I get other issues with my original data.
EDIT: There were suggestions that this is a duplicate of another question. I saw that one earlier as well, but wasn't able to integrate its code into what I want:
# I just copy-pasted some of the code bits and tried to reconstruct what I had earlier
ggplot(ds, aes(gear, fill = gear)) +
  facet_grid(cols = vars(cyl), margins = T) +
  # ..prop.. means %, but I want to keep the y-axis as count
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat = "count") +
  # not sure why, but I only get 100%
  geom_text(aes(label = scales::percent(..prop..),
                y = ..prop..), stat = "count", vjust = -.5)
The issue is that ggplot doesn't know that each facet is one group. This very useful tutorial offers a nice solution: just add aes(group = 1).
P.S. At the beginning, I was often quite reluctant, and even afraid, to manipulate my data and pre-calculate data frames for plotting. But there is no need to fret! It is actually often much easier (and safer!) to first shape/aggregate your data into the right form and then plot/analyse the new data.
library(tidyverse)
library(scales)
ds <- mtcars
ds$gear <- as.factor(ds$gear)
First solution:
ggplot(ds, aes(gear, fill = gear)) +
  geom_bar() +
  facet_grid(cols = vars(cyl), margins = T) +
  geom_text(aes(label = scales::percent(..prop..), group = 1), stat = "count")
edit to reply to comment
Showing percentages across facets is quite confusing to the reader of the figure, and I would probably recommend against such a visualization. You won't get around data manipulation here. The challenge is to include your "facet margin". I create two summary data frames and bind them together.
ds_count <- ds %>%
  count(cyl, gear) %>%
  group_by(gear) %>%
  mutate(perc = n / sum(n)) %>%
  ungroup() %>%
  mutate(cyl = as.character(cyl))

ds_all <- ds %>%
  count(cyl, gear) %>%
  group_by(gear) %>%
  summarise(n = sum(n)) %>%
  mutate(cyl = 'all', perc = 1)

ds_new <- bind_rows(ds_count, ds_all)
ggplot(ds_new, aes(gear, n, fill = gear)) +
  geom_col() +
  facet_grid(cols = vars(cyl)) +
  geom_text(aes(label = scales::percent(perc)), vjust = -0.5)
IMO, a better way would be to simply swap the x and facetting variables. Then you can use ggplot's summarising functions as above.
ggplot(ds, aes(as.character(cyl), fill = gear)) +
  geom_bar() +
  facet_grid(cols = vars(gear), margins = T) +
  geom_text(aes(label = scales::percent(..prop..), group = 1), stat = "count")
Created on 2020-02-07 by the reprex package (v0.3.0)

SparkR/SQL: Count records satisfying criteria within a rolling time window using timestamps

I have a dataset with a structure similar to the df you get from this:
dates <- base::seq.POSIXt(from = as.POSIXlt(as.Date("2018-01-01"), format = "%Y-%m-%d"),
                          to = as.POSIXlt(as.Date("2018-01-03"), format = "%Y-%m-%d"),
                          by = "hour")
possible_statuses <- c('moving', 'stopped')
statuses4demo <- base::sample(possible_statuses, size = 98, replace = TRUE, prob = c(.75, .25))
hours_back <- 5
hours_back_milliseconds <- hours_back * 3600 * 1000

# Generate dataframe
df <- data.frame(date = rep(dates, 2),
                 user_id = c(rep("user_1", 49), rep("user_2", 49)),
                 status = statuses4demo)
df$row_id <- seq(from = 1, to = nrow(df), by = 1)
df$eventTimestamp <- as.numeric(format(df$date, "%s")) * 1000
df$hours_back_timestamp <- df$eventTimestamp - hours_back_milliseconds
df$num_stops_within_past_5_hours <- 0
I would like to get a dataframe with rolling counts of the number of observations with a status of "stopped" for each row. To do this in R, I just wrote a couple of nested loops, i.e., ran this:
for (i in 1:length(unique(df$user_id))) {
  the_user <- unique(df$user_id)[i]
  filtered_data <- df[which(df$user_id == the_user), ]
  for (j in 1:nrow(filtered_data)) {
    the_row_id <- filtered_data$row_id[j]
    the_time <- filtered_data$eventTimestamp[j]
    the_past_time <- filtered_data$hours_back_timestamp[j]
    num_stops_in_past_interval <- base::nrow(filtered_data[filtered_data$eventTimestamp >= the_past_time &
                                                           filtered_data$eventTimestamp < the_time &
                                                           filtered_data$status == "stopped", ])
    df$num_stops_within_past_5_hours[which(df$row_id == the_row_id)] <- num_stops_in_past_interval
  }
}
View(df)
I am trying to do the same thing, but either by using the built-in functions in SparkR or (I think more likely) an SQL statement. I am wondering if anyone knows how I could reproduce the output of the df, but inside a Spark context? Any help is much appreciated. Thank you in advance. --Nate
Start with this data:
sdf<- SparkR::createDataFrame(df[, c("date", "eventTimestamp", "status", "user_id", "row_id")])
This solution works for the sample data as you have it set up, but isn't a more general solution for observations with any arbitrary timestamp.
ddf <- as.DataFrame(df)
ddf$count <- ifelse(ddf$status == "stopped", 1, 0)
# Create a windowSpec partitioning by user_id and ordered by date
ws <- orderBy(windowPartitionBy("user_id"), "date")
# Get the cumulative sum of the count variable by user id
ddf$count <- over(sum(ddf$count), ws)
# Get the lagged value of the cumulative sum from 5hrs ago
ddf$lag_count <- over(lag(ddf$count, offset = 5, default = 0), ws)
# The count of stops in the last 5hrs is the difference between the two
ddf$num_stops_within_past_5_hours <- ddf$count - ddf$lag_count
Edited to add a more general solution that can handle inconsistent time breaks
# Using a sampled version of the original df to create inconsistent time breaks
ddf <- as.DataFrame(df[base::sample(nrow(df), nrow(df) - 20), ])
ddf$count <- ifelse(ddf$status == "stopped", 1, 0)
to_join <- ddf %>%
  select("count", "eventTimestamp", "user_id") %>%
  rename(eventTimestamp_ = .$eventTimestamp, user_id_ = .$user_id)
ddf$count <- NULL
# Join in each row where the event timestamp is within the interval
ddf_new <- join(ddf, to_join,
                ddf$hours_back_timestamp <= to_join$eventTimestamp_ &
                ddf$eventTimestamp >= to_join$eventTimestamp_ &
                ddf$user_id == to_join$user_id_,
                joinType = "left")
ddf_new <- ddf_new %>%
  groupBy('date',
          'eventTimestamp',
          'user_id',
          'status',
          'row_id',
          'hours_back_timestamp') %>%
  agg(num_stops_within_past_5_hours = sum(ddf_new$count))
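Since the question also mentions doing this with an SQL statement, here is a hedged sketch of the same rolling count as Spark SQL over a range window, assuming the sdf DataFrame from above is registered as a temporary view (5 hours = 18,000,000 ms; the frame ends at 1 PRECEDING, which excludes the current row and matches the strict < in the R loop):
createOrReplaceTempView(sdf, "events")
result <- sql("
  SELECT *,
         COALESCE(SUM(CASE WHEN status = 'stopped' THEN 1 ELSE 0 END) OVER (
           PARTITION BY user_id
           ORDER BY eventTimestamp
           RANGE BETWEEN 18000000 PRECEDING AND 1 PRECEDING
         ), 0) AS num_stops_within_past_5_hours
  FROM events")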

Using Shiny to plot database values

I have looked through many different posts on SQL and RODBC, but for some reason I cannot seem to figure this out. I have created a Shiny app. A user enters a date range of choice, and a plot should show the queried data. However, this does not work. I have tried hard-coding the dates in the SQL string, and that WORKS, so I have pinpointed the problem: the dates are not properly carrying over into the SQL string. I have tried paste0(), but read that sprintf() works better when there are multiple values. BTW, I am querying a PI server. Here is my code:
Server
library(shiny)
library(RODBC)

shinyServer(function(input, output) {
  Connection <- odbcConnect(dsn = "PIWHI", believeNRows = FALSE, rows_at_time = 1)
  x <- reactive({input$range[1]})
  y <- reactive({input$range[2]})
  query <- sprintf("SELECT time, value
                    FROM picomp
                    WHERE tag = 'A80100'
                    AND time >= DATE('%s')
                    AND time <= DATE('%s')", x, y)
  LineX <- sqlQuery(Connection, query)
  Gravity <- LineX$VALUE
  Time <- LineX$TIME
  output$den <- renderPlot({plot(Time, Gravity)})
})
UI
library(shiny)
library(reshape)

shinyUI(pageWithSidebar(
  # Application title
  headerPanel("Monitoring Tool"),
  sidebarPanel(
    dateRangeInput('range',
                   label = 'Date input: dd-mm-yyyy. Controls start and end of date range input in main panel.',
                   start = Sys.Date() - 1, end = Sys.Date(),
                   format = "dd-M-yyyy", startview = 'year', language = 'en', weekstart = 1),
    sliderInput("sigmacoef", "Confidence Level", min = 0, max = 5, value = 2, step = 0.5),
    submitButton("Update View")
  ),
  mainPanel(
    plotOutput("den")
  )
))
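A likely cause, offered as a sketch rather than a confirmed fix: reactive expressions must be called as functions (x(), y()) and read inside a reactive context, so the query would need to be built inside renderPlot() instead of at the top level of the server function:
# Sketch of the server fix: read x() and y() inside renderPlot(),
# reusing the Connection and query from the server code above
output$den <- renderPlot({
  query <- sprintf("SELECT time, value
                    FROM picomp
                    WHERE tag = 'A80100'
                    AND time >= DATE('%s')
                    AND time <= DATE('%s')", x(), y())
  LineX <- sqlQuery(Connection, query)
  plot(LineX$TIME, LineX$VALUE)
})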