how to calculate difference between dates in BigQuery - google-bigquery

I have a table named Employees with Columns: PersonID, Name, StartDate. I want to calculate 1) difference in days between the newest and oldest employee and 2) the longest period of time (in days) without any new hires. I have tried to use DATEDIFF, however the dates are in a single column and I'm not sure what other method I should use. Any help would be greatly appreciated

Below is for BigQuery Standard SQL
#standardSQL
SELECT
SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM (
SELECT
DATE_DIFF(
StartDate,
LAG(StartDate) OVER(ORDER BY StartDate),
DAY
) days_before_next_hire
FROM `project.dataset.your_table`
)
You can test, play with above using dummy data as in the example below
#standardSQL
WITH `project.dataset.your_table` AS (
SELECT DATE '2019-01-01' StartDate UNION ALL
SELECT '2019-01-03' StartDate UNION ALL
SELECT '2019-01-13' StartDate
)
SELECT
SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM (
SELECT
DATE_DIFF(
StartDate,
LAG(StartDate) OVER(ORDER BY StartDate),
DAY
) days_before_next_hire
FROM `project.dataset.your_table`
)
with result
Row days_between_newest_and_oldest_employee longest_period_without_new_hire
1 12 9
Note use of -1 in calculating longest_period_without_new_hire - it is really up to you to use this adjustment or not depends on your preferences of counting gaps

1) difference in days between the newest and oldest record
WITH table AS (
SELECT DATE(created_at) date, *
FROM `githubarchive.day.201901*`
WHERE _table_suffix<'2'
AND repo.name = 'google/bazel-common'
AND type='ForkEvent'
)
SELECT DATE_DIFF(MAX(date), MIN(date), DAY) max_minus_min
FROM table
2) the longest period of time (in days) without any new records
WITH table AS (
SELECT DATE(created_at) date, *
FROM `githubarchive.day.201901*`
WHERE _table_suffix<'2'
AND repo.name = 'google/bazel-common'
AND type='ForkEvent'
)
SELECT MAX(diff) max_diff
FROM (
SELECT DATE_DIFF(date, LAG(date) OVER(ORDER BY date), DAY) diff
FROM table
)

Related

Select latest 30 dates for each unique ID

This is a sample data file
Data Contains unique IDs with different latitudes and longitudes on multiple timestamps.I would like to select the rows of latest 30 days of coordinates for each unique ID.Please help me on how to run the query .This date is in Hive table
Regards,
Akshay
According to your example above (where no current year dates for id=2,3), you can numbering date for each id (order by date descending) using window function ROW_NUMBER(). Then just get latest 30 values:
--get all values for each id where num<=30 (get last 30 days for each day)
select * from
(
--numbering each date for each id order by descending
select *, row_number()over(partition by ID order by DATE desc)num from Table
)X
where num<=30
If you need to get only unique dates (without consider time) for each id, then can try this query:
select * from
(
--numbering date for each id
select *, row_number()over(partition by ID order by new_date desc)num
from
(
-- move duplicate using distinct
select distinct ID,cast(DATE as date)new_date from Table
)X
)Y
where num<=30
In Oracle this will be:
SELECT * FROM TEST_DATE1
WHERE DATEUPDT > SYSDATE - 30;
select * from MyTable
where
[Date]>=dateadd(d, -30, getdate());
To group by ID and perform aggregation, something like this
select ID,
count(*) row_count,
max(Latitude) max_lat,
max(Longitude) max_long
from MyTable
where
[Date]>=dateadd(d, -30, getdate())
group by ID;

Segregate the row based on the date time column per each month

I have the following table in sql server database environment.
the format of start date MM/DD/YYYY.
I need the result to be like the following table.
based on start date column the record should segregated to each month in the period between start date and end date
You can use a recursive CTE:
with cte as (
select id, startdate as dte, enddate
from t
union all
select id,
dateadd(day, 1, eomonth(dte)),
enddate
from t
where eomonth(dte) < enddate
)
select id, dte,
lead(dte, 1, enddate) over (partition by id order by dte)
from cte;
Thank you Gordon Linoff
Using CTE I have got the following result
My code
WITH cte
AS (SELECT 1 AS id,
Cast('2010-01-20' AS DATE) AS trg,
Cast('2010-01-20' AS DATE) AS strt_dte,
Cast('2010-03-15' AS DATE) AS end_dte
UNION ALL
SELECT id,
Dateadd(day, 1, Eomonth (trg)),
strt_dte,
end_dte
FROM cte
WHERE Eomonth(trg) < end_dte)
SELECT id,
trg,
strt_dte,
end_dte,
Lead (trg, 1, end_dte)
OVER (
partition BY id
ORDER BY trg) AS lead_result
FROM cte

How to replace the loop in MsSQL?

For example
If I want to check in every day last week
select count(ID) from DB where date < "2019/07/01"
select count(ID) from DB where date < "2019/07/02"
select count(ID) from DB where date < "2019/07/03"
...
select count(ID) from DB where date < "2019/07/08"
like
0701 10
0702 15
0703 23
...
0707 45
How to do this without loop and one query?
You can generate the dates using a recursive CTE (or other method) and then run the query:
with dates as (
select convert(date, '2019-07-01') as dte union all
select dateadd(day, 1, dte)
from dates
where dte < '2019-07-08'
)
select d.dte,
(select count(*) from DB where DB.date < d.dte)
from dates d;
More efficient, though, is a cumulative sum:
select db.*
from (select date, count(*) as cnt, sum(count(*)) over (order by date) as running_cnt
from db
group by date
) d
where d.date >= '2019-07-01' and d.date < '2019-07-09';
Are you just counting the number by day?
Something like
SELECT MONTH(date), DAY(date), COUNT(ID)
FROM DB
GROUP BY MONTH(date), DAY(date);
(assuming date is a DATE or DATETIME)
Do it with window Count. range between current row and current row selects exactly this day rows.
select distinct date, count(1) over (order by Date) - count(1) over (order by Date range between current row and current row)
from DB
where date between '2019-07-01' and '2019-07-08';
I assume date column is exactly DATE.

How can I count users in a month that were not present in the month before?

I am trying to count unique users on a monthly basis that were not present in the previous month. So if a user has a record for January and then another one for February, then I would only count January for that user.
user_id time
a1 1/2/17
a1 2/10/17
a2 2/18/17
a4 2/5/17
a5 3/25/17
My results should look like this
Month User Count
January 1
February 2
March 1
I'm not really familiar with BigQuery, but here's how I would solve the problem using TSQL. I imagine that you'd be able to use similar logic in BigQuery.
1). Order the data by user_id first, and then time. In TSQL, you can accomplish this with the following and store it in a common table expression, which you will query in the step after this.
;WITH cte AS
(
select ROW_NUMBER() OVER (PARTITION BY [user_id] ORDER BY [time]) AS rn,*
from dbo.employees
)
2). Next query for only the rows with rn = 1 (the first occurrence for a particular user) and group by the month.
select DATENAME(month, [time]) AS [Month], count(*) AS user_count
from cte
where rn = 1
group by DATENAME(month, [time])
This is assuming that 2017 is the only year you're dealing with. If you're dealing with more than one year, you probably want step #2 to look something like this:
select year([time]) as [year], DATENAME(month, [time]) AS [month],
count(*) AS user_count
from cte
where rn = 1
group by year([time]), DATENAME(month, [time])
First aggregate by the user id and the month. Then use lag() to see if the user was present in the previous month:
with du as (
select date_trunc(time, month) as yyyymm, user_id
from t
group by date_trunc(time, month)
)
select yyyymm, count(*)
from (select du.*,
lag(yyyymm) over (partition by user_id order by yyyymm) as prev_yyyymm
from du
) du
where prev_yyyymm is not null or
prev_yyyymm < date_add(yyyymm, interval 1 month)
group by yyyymm;
Note: This uses the date functions, but similar functions exist for timestamp.
The way I understood question is - to exclude user to be counted in given month only if same user presented in previous month. But if same user present in few months before given, but not in previous - user should be counted.
If this is correct - Try below for BigQuery Standard SQL
#standardSQL
SELECT Year, Month, COUNT(DISTINCT user_id) AS User_Count
FROM (
SELECT *,
DATE_DIFF(time, LAG(time) OVER(PARTITION BY user_id ORDER BY time), MONTH) AS flag
FROM (
SELECT
user_id,
DATE_TRUNC(PARSE_DATE('%x', time), MONTH) AS time,
EXTRACT(YEAR FROM PARSE_DATE('%x', time)) AS Year,
FORMAT_DATE('%B', PARSE_DATE('%x', time)) AS Month
FROM yourTable
GROUP BY 1, 2, 3, 4
)
)
WHERE IFNULL(flag, 0) <> 1
GROUP BY Year, Month, time
ORDER BY time
you can test / play with above using below example with dummy data from your question
#standardSQL
WITH yourTable AS (
SELECT 'a1' AS user_id, '1/2/17' AS time UNION ALL
SELECT 'a1', '2/10/17' UNION ALL
SELECT 'a2', '2/18/17' UNION ALL
SELECT 'a4', '2/5/17' UNION ALL
SELECT 'a5', '3/25/17'
)
SELECT Year, Month, COUNT(DISTINCT user_id) AS User_Count
FROM (
SELECT *,
DATE_DIFF(time, LAG(time) OVER(PARTITION BY user_id ORDER BY time), MONTH) AS flag
FROM (
SELECT
user_id,
DATE_TRUNC(PARSE_DATE('%x', time), MONTH) AS time,
EXTRACT(YEAR FROM PARSE_DATE('%x', time)) AS Year,
FORMAT_DATE('%B', PARSE_DATE('%x', time)) AS Month
FROM yourTable
GROUP BY 1, 2, 3, 4
)
)
WHERE IFNULL(flag, 0) <> 1
GROUP BY Year, Month, time
ORDER BY time
The output is
Year Month User_Count
2017 January 1
2017 February 2
2017 March 1
Try this query:
SELECT
t1.d,
count(DISTINCT t1.user_id)
FROM
(
SELECT
EXTRACT(MONTH FROM time) AS d,
--EXTRACT(MONTH FROM time)-1 AS d2,
user_id
FROM nbitra.tmp
) t1
LEFT JOIN
(
SELECT
EXTRACT(MONTH FROM time) AS d,
user_id
FROM nbitra.tmp
) t2
ON t1.d = t2.d+1
WHERE
(
t1.user_id <> t2.user_id --User is in previous month
OR t2.user_id IS NULL --To handle january, since there is no previous month to compare to
)
GROUP BY t1.d;

Google BigQuery: Rolling Count Distinct

I have a table with is simply a list of dates and user IDs (not aggregated).
We define a metric called active users for a given date by counting the distinct number of IDs that appear in the previous 45 days.
I am trying to run a query in BigQuery that, for each day, returns the day plus the number of active users for that day (count distinct user from 45 days ago until today).
I have experimented with window functions, but can't figure out how to define a range based on the date values in a column. Instead, I believe the following query would work in a database like MySQL, but does not in BigQuery.
SELECT
day,
(SELECT
COUNT(DISTINCT visid)
FROM daily_users
WHERE day BETWEEN DATE_ADD(t.day, -45, "DAY") AND t.day
) AS active_users
FROM daily_users AS t
GROUP BY 1
This doesn't work in BigQuery: "Subselect not allowed in SELECT clause."
How to do this in BigQuery?
BigQuery documentation claims that count(distinct) works as a window function. However, that doesn't help you, because you are not looking for a traditional window frame.
One method would adds a record for each date after a visit:
select theday, count(distinct visid)
from (select date_add(u.day, n.n, "day") as theday, u.visid
from daily_users u cross join
(select 1 as n union all select 2 union all . . .
select 45
) n
) u
group by theday;
Note: there may be simpler ways to generate a series of 45 integers in BigQuery.
Below should work with BigQuery
#legacySQL
SELECT day, active_users FROM (
SELECT
day,
COUNT(DISTINCT id)
OVER (ORDER BY ts RANGE BETWEEN 45*24*3600 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, TIMESTAMP_TO_SEC(TIMESTAMP(day)) AS ts
FROM daily_users
)
) GROUP BY 1, 2 ORDER BY 1
Above assumes that day field is represented as '2016-01-10' format.
If it is not a case , you should adjust TIMESTAMP_TO_SEC(TIMESTAMP(day)) in most inner select
Also please take a look at COUNT(DISTINC) specifics in BigQuery
Update for BigQuery Standard SQL
#standardSQL
SELECT
day,
(SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM (
SELECT
day,
ARRAY_AGG(id)
OVER (ORDER BY ts RANGE BETWEEN 3888000 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
FROM daily_users
)
)
GROUP BY 1, 2
ORDER BY 1
You can test / play with it using below dummy sample
#standardSQL
WITH daily_users AS (
SELECT 1 AS id, '2016-01-10' AS day UNION ALL
SELECT 2 AS id, '2016-01-10' AS day UNION ALL
SELECT 1 AS id, '2016-01-11' AS day UNION ALL
SELECT 3 AS id, '2016-01-11' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-13' AS day
)
SELECT
day,
(SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM (
SELECT
day,
ARRAY_AGG(id)
OVER (ORDER BY ts RANGE BETWEEN 86400 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
FROM daily_users
)
)
GROUP BY 1, 2
ORDER BY 1