Pro Rata in BigQuery [duplicate] - sql

This question already has answers here:
SQL equally distribute value by rows
(2 answers)
Do loop in BigQuery
(1 answer)
Closed 1 year ago.
I want to pro rate a table like this:
into a table like this:
essentially I want to create rows for the days between date_start and date end and then divide spend by how many days there are.
I am currently using the query below to do this, using BigQuery scripting - I know this probably is a horrible way of querying this but I'm not sure how else to do it. It takes about 30 seconds to run this query for just 3 rows.
-- BigQuery script: rebuilds `pro_rata_test.pro_rata` by expanding each row of
-- `pro_rata_test.data` into one row per day between date_start and date_end,
-- with spend divided by the inclusive number of days in that range.
-- i: 1-based index of the source row handled by the current loop iteration.
DECLARE i INT64 DEFAULT 1;
-- n: total number of source rows, i.e. the number of loop iterations.
DECLARE n int64;
SET n = (SELECT COUNT(*) FROM `pro_rata_test.data`);
-- Empty the target table so every run starts from scratch.
DELETE FROM `pro_rata_test.pro_rata` WHERE TRUE;
-- One INSERT statement is issued per source row.
WHILE i <= n DO
INSERT INTO
pro_rata_test.pro_rata
SELECT
day,
country,
campaign,
other,
-- Divisor: inclusive day count of the date range of the row numbered i.
-- NOTE(review): this ROW_NUMBER() and the one in the FROM clause below are
-- computed independently and order only by date_start; if two rows share a
-- date_start they may be numbered differently in each - confirm date_start
-- is unique.
SUM(spend)/(
SELECT
DATETIME_DIFF(DATETIME(TIMESTAMP(date_end)),
DATETIME(TIMESTAMP(date_start)),
DAY) + 1
FROM (
SELECT *, ROW_NUMBER() OVER(ORDER BY date_start) AS rn FROM `pro_rata_test.data`)
WHERE
rn = i) AS spend
FROM (
SELECT *, ROW_NUMBER() OVER(ORDER BY date_start) AS rn FROM `pro_rata_test.data`),
-- Expands the selected row into one row per calendar day of its range.
UNNEST(GENERATE_DATE_ARRAY(date_start, date_end)) day
WHERE
rn = i
GROUP BY
day,
country,
campaign,
other
ORDER BY
day;
-- Advance to the next source row.
SET
i = i + 1;
END WHILE

Try generate_date_array and unnest:
-- Spread each row's spend evenly over every day in [date_start, date_end].
WITH mytable AS (
  -- Inline sample rows standing in for the real table.
  SELECT DATE '2021-01-01' AS date_start, DATE '2021-01-10' AS date_end, 100 AS spend, 'FR' AS country, 'Campaign1' AS campaign, 'test1' AS Other
  UNION ALL
  SELECT DATE '2021-01-11', DATE '2021-02-27', 150, 'UK', 'Campaign1', 'test2'
  UNION ALL
  SELECT DATE '2021-03-20', DATE '2021-04-20', 500, 'UK', 'Campaign2', 'test2'
)
SELECT
  day,
  country,
  campaign,
  other,
  -- +1 because both the start and the end date receive a share.
  spend / (DATE_DIFF(date_end, date_start, DAY) + 1) AS spend
FROM mytable
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(date_start, date_end)) AS day
ORDER BY day

Related

greenplum string_agg conversion into hivesql supported

We are migrating greenplum sql query to hivesql and please find below statement available, string_agg. how do we migrate, kindly help us. below sample greenplum code needed for migration hive.
-- Greenplum: concatenate one "<day label> <formatted revenue>" entry per day
-- label into a single ';'-separated string, ordered by the day label.
select string_agg(Display_String, ';' order by data_day )
from
(
-- One row per day label: summed revenue plus its display string.
select data_day,
sum(revenue)/1000000.00 as revenue,
data_day||' '||trim(to_char(sum(revenue),'9,999,999,999')) as Display_String
from(
-- Label each row by its offset from today; dates matching no branch get a NULL label.
select case when data_date = current_date then 'D:'
when data_date = current_date - 1 then ' D-01:'
when data_date = current_date - 2 then ' D-02:'
when data_date = current_date - 7 then ' D-07:'
when data_date = current_date - 28 then ' D-28:'
end data_day, revenue/1000000.00 revenue
from test.testable
-- Hour cut-off: the second-highest hour present today for type 'UVC'.
where data_date between current_date - 28 and current_date and hour <=(Select hour from ( select row_number() over(order by hour desc) iRowsID, hour from test.testable where data_date = current_date and type = 'UVC')tbl1
where irowsid = 2) and type in( 'UVC')
order by 1 desc) a
group by 1)aa;
There is nothing like this in hive. However you can use collect list and partition by/Order by to calculate it.
-- Hive has no string_agg; emulate it with collect_list over an ordered window
-- and join the collected values with ';'.
-- "(your above SQL)" is a placeholder for the query that produces
-- Display_String and data_day.
select concat_ws(';', max(concat_str))
FROM (
SELECT collect_list(Display_String) over (order by data_day ) concat_str
FROM
(your above SQL) s ) concat_qry
Explanation -
collect_list gathers the strings into a list, and the ORDER BY in the window clause orders the data by the day column while doing so.
Outermost MAX() will pickup max data for the concatenated string.
Pls note this is a very slow operation. Test performance as well before implementing it.
Here is a sample SQL and result to help you.
-- Sample: per id, join the collected c values with ';'.
SELECT
  id,
  concat_ws(';', max(concat_str))
FROM (
  -- Windowed, ordered list of c values per id.
  SELECT
    src.id,
    collect_list(src.c) OVER (PARTITION BY src.id ORDER BY src.c) AS concat_str
  FROM (
    SELECT 1 id, 'ax' c
    UNION SELECT 1, 'b'
    UNION SELECT 2, 'f'
    UNION SELECT 2, 'g'
    UNION ALL SELECT 1, 'b'
    UNION ALL SELECT 1, 'b'
  ) src
) collected
GROUP BY id

how to calculate difference between dates in BigQuery

I have a table named Employees with Columns: PersonID, Name, StartDate. I want to calculate 1) difference in days between the newest and oldest employee and 2) the longest period of time (in days) without any new hires. I have tried to use DATEDIFF, however the dates are in a single column and I'm not sure what other method I should use. Any help would be greatly appreciated
Below is for BigQuery Standard SQL
#standardSQL
-- Summarise the gaps between consecutive hire dates: their sum spans oldest to
-- newest hire, and the largest gap (minus one) is the longest quiet period.
WITH hire_gaps AS (
  -- Days elapsed since the previous StartDate; NULL for the earliest hire.
  SELECT
    DATE_DIFF(StartDate, LAG(StartDate) OVER (ORDER BY StartDate), DAY) AS days_before_next_hire
  FROM `project.dataset.your_table`
)
SELECT
  SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
  MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM hire_gaps
You can test, play with above using dummy data as in the example below
#standardSQL
-- Same hire-gap summary, run against three inline sample dates.
WITH `project.dataset.your_table` AS (
  SELECT DATE '2019-01-01' StartDate
  UNION ALL
  SELECT '2019-01-03' StartDate
  UNION ALL
  SELECT '2019-01-13' StartDate
),
hire_gaps AS (
  -- Days elapsed since the previous StartDate; NULL for the earliest hire.
  SELECT
    DATE_DIFF(StartDate, LAG(StartDate) OVER (ORDER BY StartDate), DAY) AS days_before_next_hire
  FROM `project.dataset.your_table`
)
SELECT
  SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
  MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM hire_gaps
with result
Row days_between_newest_and_oldest_employee longest_period_without_new_hire
1 12 9
Note the use of -1 in calculating longest_period_without_new_hire - whether to apply this adjustment is up to you and depends on how you prefer to count gaps.
1) difference in days between the newest and oldest record
-- Day span between the latest and the earliest matching fork event.
WITH fork_events AS (
  SELECT
    DATE(created_at) date,
    *
  FROM `githubarchive.day.201901*`
  WHERE type = 'ForkEvent'
    AND repo.name = 'google/bazel-common'
    AND _table_suffix < '2'
)
SELECT DATE_DIFF(MAX(date), MIN(date), DAY) AS max_minus_min
FROM fork_events
2) the longest period of time (in days) without any new records
-- Largest number of days between two consecutive matching fork events.
WITH fork_events AS (
  SELECT
    DATE(created_at) date,
    *
  FROM `githubarchive.day.201901*`
  WHERE type = 'ForkEvent'
    AND repo.name = 'google/bazel-common'
    AND _table_suffix < '2'
),
gaps AS (
  -- Days since the previous event date; NULL for the first event.
  SELECT DATE_DIFF(date, LAG(date) OVER (ORDER BY date), DAY) AS diff
  FROM fork_events
)
SELECT MAX(diff) AS max_diff
FROM gaps

Compare array of datetime objects and pick all rows where difference between each and the next is less than 7 days

My table looks like this:
(can't post images yet)
I want to select all names from my table where the time difference between each of the datetime objects and the next is always more than 7 days.
So from the above I would get only Paul, since Adam's first two times are already only a day apart.
The best I can come up with is to get the time difference between the smallest and largest datetime in the array and then divide by array_length(datetime). So basically the average time all datetime objects, but that's not helping me.
I'm using Standard SQL on BigQuery
-- Keep a name only when every pair of adjacent entries in its datetime array
-- is more than 7 days apart.
SELECT name
FROM dataset.table
WHERE NOT EXISTS(
  SELECT 1 FROM UNNEST(datetime) AS dt WITH OFFSET off
  -- DATETIME_DIFF is signed (first argument minus second), so without ABS()
  -- any pair stored in ascending order yields a negative value that always
  -- satisfies "<= 7". ABS() makes the check independent of storage order.
  -- For off = 0 there is no previous element: SAFE_OFFSET(-1) yields NULL,
  -- the comparison is NULL, and the first entry never counts as a violation.
  WHERE ABS(DATETIME_DIFF(
    datetime[SAFE_OFFSET(off - 1)], dt, DAY
  )) <= 7
)
This compares each entry in the array with the one after it, looking for any where the number of days is 7 or less.
You can use unnest():
-- Return the rows of t whose datetime array has no entry that falls less than
-- 7 days after the chronologically preceding entry.
select t.*
from t
where not exists (select 1
                  -- Pair every entry with the one before it in time order;
                  -- prev_dt is NULL for the earliest entry.
                  from (select dt, lag(dt) over (order by dt) as prev_dt
                        from unnest(datetime) dt
                       ) x
                  where dt < datetime_add(prev_dt, interval 7 day)
                 );
It is still not clear what exactly the schema of your data is: based on the layout, it looks like datetime is an array, but based on the data type you show in the image, it could be just a regular field, so the below covers both cases (for BigQuery Standard SQL)
Case 1 - repeated field
#standardSQL
-- Repeated-field case: keep a name when the smallest gap between
-- chronologically consecutive array entries exceeds 7 days.
SELECT name
FROM `project.dataset.table`
WHERE (
  SELECT
    DATETIME_DIFF(dt, LAG(dt) OVER (PARTITION BY name ORDER BY dt), DAY) AS distance
  FROM UNNEST(datetime) AS dt
  -- The earliest entry has no predecessor (NULL distance); 777 pushes it
  -- behind every real gap so LIMIT 1 returns the smallest real gap.
  ORDER BY IFNULL(distance, 777)
  LIMIT 1
) > 7
you can test, play with it using dummy data as below
#standardSQL
-- Repeated-field case, run against inline sample arrays for Adam and Paul.
WITH `project.dataset.table` AS (
  SELECT
    'Adam' name,
    [DATETIME '2018-07-26T17:55:03',
     '2018-07-27T17:55:03',
     '2018-06-29T17:55:03',
     '2018-07-16T17:55:03',
     '2018-08-19T17:55:03',
     '2018-07-14T17:55:03'] datetime
  UNION ALL
  SELECT
    'Paul',
    [DATETIME '2018-08-26T17:55:03',
     '2018-08-18T17:55:03',
     '2018-06-20T17:55:03',
     '2018-08-09T17:55:03',
     '2018-07-16T17:55:03']
)
SELECT name
FROM `project.dataset.table`
WHERE (
  SELECT
    DATETIME_DIFF(dt, LAG(dt) OVER (PARTITION BY name ORDER BY dt), DAY) AS distance
  FROM UNNEST(datetime) AS dt
  -- The earliest entry has no predecessor (NULL distance); 777 pushes it
  -- behind every real gap so LIMIT 1 returns the smallest real gap.
  ORDER BY IFNULL(distance, 777)
  LIMIT 1
) > 7
Case 2 - regular (not repeated field)
#standardSQL
-- Regular-field case: keep a name when the smallest gap between its
-- chronologically consecutive rows exceeds 7 days.
WITH gaps AS (
  -- Days since the same name's previous datetime; NULL for its first row.
  SELECT
    name,
    DATETIME_DIFF(datetime, LAG(datetime) OVER (PARTITION BY name ORDER BY datetime), DAY) AS distance
  FROM `project.dataset.table`
)
SELECT name
FROM gaps
GROUP BY name
HAVING MIN(distance) > 7
Dummy data example below:
#standardSQL
-- Regular-field case, run against inline sample rows for Adam and Paul.
WITH `project.dataset.table` AS (
  SELECT 'Adam' name, DATETIME '2018-07-26T17:55:03' datetime
  UNION ALL SELECT 'Adam', '2018-07-27T17:55:03'
  UNION ALL SELECT 'Adam', '2018-06-29T17:55:03'
  UNION ALL SELECT 'Adam', '2018-07-16T17:55:03'
  UNION ALL SELECT 'Adam', '2018-08-19T17:55:03'
  UNION ALL SELECT 'Adam', '2018-07-14T17:55:03'
  UNION ALL SELECT 'Paul', '2018-08-26T17:55:03'
  UNION ALL SELECT 'Paul', '2018-08-18T17:55:03'
  UNION ALL SELECT 'Paul', '2018-06-20T17:55:03'
  UNION ALL SELECT 'Paul', '2018-08-09T17:55:03'
  UNION ALL SELECT 'Paul', '2018-07-16T17:55:03'
),
gaps AS (
  -- Days since the same name's previous datetime; NULL for its first row.
  SELECT
    name,
    DATETIME_DIFF(datetime, LAG(datetime) OVER (PARTITION BY name ORDER BY datetime), DAY) AS distance
  FROM `project.dataset.table`
)
SELECT name
FROM gaps
GROUP BY name
HAVING MIN(distance) > 7
both return same result
Row name
1 Paul

How can I count users in a month that were not present in the month before?

I am trying to count unique users on a monthly basis that were not present in the previous month. So if a user has a record for January and then another one for February, then I would only count January for that user.
user_id time
a1 1/2/17
a1 2/10/17
a2 2/18/17
a4 2/5/17
a5 3/25/17
My results should look like this
Month User Count
January 1
February 2
March 1
I'm not really familiar with BigQuery, but here's how I would solve the problem using TSQL. I imagine that you'd be able to use similar logic in BigQuery.
1). Order the data by user_id first, and then time. In TSQL, you can accomplish this with the following and store it in a common table expression, which you will query in the step after this.
;WITH cte AS
(
    -- rn numbers each user's rows from the earliest [time] to the latest.
    SELECT
        ROW_NUMBER() OVER (PARTITION BY [user_id] ORDER BY [time]) AS rn,
        *
    FROM dbo.employees
)
2). Next query for only the rows with rn = 1 (the first occurrence for a particular user) and group by the month.
-- Count, per month name, the rows numbered rn = 1 in cte.
SELECT
    DATENAME(month, [time]) AS [Month],
    COUNT(*) AS user_count
FROM cte
WHERE rn = 1
GROUP BY DATENAME(month, [time])
This is assuming that 2017 is the only year you're dealing with. If you're dealing with more than one year, you probably want step #2 to look something like this:
-- Same count as the single-year version, additionally split by year.
SELECT
    YEAR([time]) AS [year],
    DATENAME(month, [time]) AS [month],
    COUNT(*) AS user_count
FROM cte
WHERE rn = 1
GROUP BY
    YEAR([time]),
    DATENAME(month, [time])
First aggregate by the user id and the month. Then use lag() to see if the user was present in the previous month:
-- Count, per month, the users who have no record in the immediately
-- preceding month.
with du as (
      -- One row per (month, user). user_id must be grouped as well because it
      -- is selected without an aggregate.
      select date_trunc(time, month) as yyyymm, user_id
      from t
      group by date_trunc(time, month), user_id
     )
select yyyymm, count(*)
from (select du.*,
             -- Most recent earlier month in which the same user appeared.
             lag(yyyymm) over (partition by user_id order by yyyymm) as prev_yyyymm
      from du
     ) du
-- Keep a user-month when the user was never seen before, or was last seen
-- earlier than the previous month.
where prev_yyyymm is null or
      prev_yyyymm < date_sub(yyyymm, interval 1 month)
group by yyyymm;
Note: This uses the date functions, but similar functions exist for timestamp.
The way I understood the question is: exclude a user from the count for a given month only if the same user was present in the previous month. But if the same user was present a few months before the given month, and not in the previous one, the user should be counted.
If this is correct - Try below for BigQuery Standard SQL
#standardSQL
-- Count, per month, the users whose previous active month is not the month
-- directly before it.
WITH monthly_users AS (
  -- One row per user and month; the string column time is parsed with '%x'
  -- and truncated to the first day of its month.
  SELECT
    user_id,
    DATE_TRUNC(PARSE_DATE('%x', time), MONTH) AS time,
    EXTRACT(YEAR FROM PARSE_DATE('%x', time)) AS Year,
    FORMAT_DATE('%B', PARSE_DATE('%x', time)) AS Month
  FROM yourTable
  GROUP BY 1, 2, 3, 4
),
flagged AS (
  -- flag: months elapsed since the user's previous active month; NULL when
  -- there is none.
  SELECT
    *,
    DATE_DIFF(time, LAG(time) OVER (PARTITION BY user_id ORDER BY time), MONTH) AS flag
  FROM monthly_users
)
SELECT
  Year,
  Month,
  COUNT(DISTINCT user_id) AS User_Count
FROM flagged
-- A flag of exactly 1 means the user was also present in the previous month.
WHERE COALESCE(flag, 0) != 1
GROUP BY Year, Month, time
ORDER BY time
you can test / play with above using below example with dummy data from your question
#standardSQL
-- Same per-month count, run against the inline sample rows from the question.
WITH yourTable AS (
  SELECT 'a1' AS user_id, '1/2/17' AS time
  UNION ALL SELECT 'a1', '2/10/17'
  UNION ALL SELECT 'a2', '2/18/17'
  UNION ALL SELECT 'a4', '2/5/17'
  UNION ALL SELECT 'a5', '3/25/17'
),
monthly_users AS (
  -- One row per user and month; the string column time is parsed with '%x'
  -- and truncated to the first day of its month.
  SELECT
    user_id,
    DATE_TRUNC(PARSE_DATE('%x', time), MONTH) AS time,
    EXTRACT(YEAR FROM PARSE_DATE('%x', time)) AS Year,
    FORMAT_DATE('%B', PARSE_DATE('%x', time)) AS Month
  FROM yourTable
  GROUP BY 1, 2, 3, 4
),
flagged AS (
  -- flag: months elapsed since the user's previous active month; NULL when
  -- there is none.
  SELECT
    *,
    DATE_DIFF(time, LAG(time) OVER (PARTITION BY user_id ORDER BY time), MONTH) AS flag
  FROM monthly_users
)
SELECT
  Year,
  Month,
  COUNT(DISTINCT user_id) AS User_Count
FROM flagged
-- A flag of exactly 1 means the user was also present in the previous month.
WHERE COALESCE(flag, 0) != 1
GROUP BY Year, Month, time
ORDER BY time
The output is
Year Month User_Count
2017 January 1
2017 February 2
2017 March 1
Try this query:
-- Per month number, count the distinct users that have no record in the
-- preceding month number (anti-join).
-- NOTE(review): only EXTRACT(MONTH ...) is compared, so years are not
-- distinguished and December -> January is not treated as consecutive.
SELECT
  t1.d,
  count(DISTINCT t1.user_id)
FROM
(
  SELECT
    EXTRACT(MONTH FROM time) AS d,
    user_id
  FROM nbitra.tmp
) t1
LEFT JOIN
(
  SELECT
    EXTRACT(MONTH FROM time) AS d,
    user_id
  FROM nbitra.tmp
) t2
  -- Match the SAME user in the previous month. Joining on the month alone
  -- pairs a user with every other user of the previous month, so a returning
  -- user would still be counted.
  ON t1.d = t2.d + 1
  AND t1.user_id = t2.user_id
WHERE
  -- No match means the user was absent in the previous month (this also
  -- covers the first month, which has no previous month to compare to).
  t2.user_id IS NULL
GROUP BY t1.d;

Google BigQuery: Rolling Count Distinct

I have a table which is simply a list of dates and user IDs (not aggregated).
We define a metric called active users for a given date by counting the distinct number of IDs that appear in the previous 45 days.
I am trying to run a query in BigQuery that, for each day, returns the day plus the number of active users for that day (count distinct user from 45 days ago until today).
I have experimented with window functions, but can't figure out how to define a range based on the date values in a column. Instead, I believe the following query would work in a database like MySQL, but does not in BigQuery.
-- Intent: for each day, count the distinct visid values seen from 45 days
-- earlier through that day, using a correlated subquery in the SELECT list.
SELECT
  day,
  (
    SELECT COUNT(DISTINCT visid)
    FROM daily_users
    WHERE day BETWEEN DATE_ADD(t.day, -45, "DAY") AND t.day
  ) AS active_users
FROM daily_users AS t
GROUP BY 1
This doesn't work in BigQuery: "Subselect not allowed in SELECT clause."
How to do this in BigQuery?
BigQuery documentation claims that count(distinct) works as a window function. However, that doesn't help you, because you are not looking for a traditional window frame.
One method would add a record for each date after a visit:
-- Emit each visit once for each of the 45 following days, then count the
-- distinct visitors per resulting day.
select theday, count(distinct visid)
from (select date_add(u.day, n.n, "day") as theday, u.visid
from daily_users u cross join
-- Numbers table 1..45. The ". . ." below is an elision in the original
-- answer, not valid SQL: spell out the remaining SELECTs before running.
-- NOTE(review): n starts at 1, so a visit is not attributed to its own day -
-- confirm whether n = 0 should be included.
(select 1 as n union all select 2 union all . . .
select 45
) n
) u
group by theday;
Note: there may be simpler ways to generate a series of 45 integers in BigQuery.
Below should work with BigQuery
#legacySQL
-- For each day, count the distinct ids whose ts lies within the trailing
-- 45 * 24 * 3600 seconds, up to and including the current row.
SELECT
  day,
  active_users
FROM (
  SELECT
    day,
    COUNT(DISTINCT id) OVER (
      ORDER BY ts
      RANGE BETWEEN 45*24*3600 PRECEDING AND CURRENT ROW
    ) AS active_users
  FROM (
    -- ts: day converted to seconds so the window frame can be given in seconds.
    SELECT
      day,
      id,
      TIMESTAMP_TO_SEC(TIMESTAMP(day)) AS ts
    FROM daily_users
  )
)
GROUP BY 1, 2
ORDER BY 1
Above assumes that day field is represented as '2016-01-10' format.
If that is not the case, you should adjust TIMESTAMP_TO_SEC(TIMESTAMP(day)) in the innermost select
Also please take a look at COUNT(DISTINCT) specifics in BigQuery
Update for BigQuery Standard SQL
#standardSQL
-- For each day, count the distinct ids seen in the trailing 45-day window.
WITH stamped AS (
  -- ts: the day string parsed as '%Y-%m-%d' and expressed in seconds.
  SELECT
    day,
    id,
    UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
  FROM daily_users
),
windowed AS (
  -- Collect every id inside the window; 3888000 s = 45 * 24 * 3600.
  SELECT
    day,
    ARRAY_AGG(id) OVER (
      ORDER BY ts
      RANGE BETWEEN 3888000 PRECEDING AND CURRENT ROW
    ) AS active_users
  FROM stamped
)
SELECT
  day,
  (SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM windowed
-- Collapses the one-row-per-input-row output to one row per (day, count).
GROUP BY 1, 2
ORDER BY 1
You can test / play with it using below dummy sample
#standardSQL
-- Demo on inline sample rows; the window here is 86400 seconds (one day)
-- rather than 45 days.
WITH daily_users AS (
  SELECT 1 AS id, '2016-01-10' AS day
  UNION ALL SELECT 2 AS id, '2016-01-10' AS day
  UNION ALL SELECT 1 AS id, '2016-01-11' AS day
  UNION ALL SELECT 3 AS id, '2016-01-11' AS day
  UNION ALL SELECT 1 AS id, '2016-01-12' AS day
  UNION ALL SELECT 1 AS id, '2016-01-12' AS day
  UNION ALL SELECT 1 AS id, '2016-01-12' AS day
  UNION ALL SELECT 1 AS id, '2016-01-13' AS day
),
stamped AS (
  -- ts: the day string parsed as '%Y-%m-%d' and expressed in seconds.
  SELECT
    day,
    id,
    UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
  FROM daily_users
),
windowed AS (
  -- Collect every id whose ts falls inside the trailing window.
  SELECT
    day,
    ARRAY_AGG(id) OVER (
      ORDER BY ts
      RANGE BETWEEN 86400 PRECEDING AND CURRENT ROW
    ) AS active_users
  FROM stamped
)
SELECT
  day,
  (SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM windowed
-- Collapses the one-row-per-input-row output to one row per (day, count).
GROUP BY 1, 2
ORDER BY 1