How to repeat same value based on month date in Bigquery - google-bigquery

anyone can help me doing this in bigquery? So i have 2 table like this
01/01/2000
01/02/2000
01/03/2000
01/04/2000
and this
start | end | status
01/01/2000 | 01/02/2000 | a
01/02/2000 | 01/06/2000 | b
i want them become like this
month | status
01/01/2000 | a
01/02/2000 | b
01/03/2000 | b
01/04/2000 | b

You can handle this with a between in the join. The only caveat is how to handle the overlap of dates. In this case I've subtracted a day from the end to not include it in the range.
with temp1 as(
select '01/01/2000' dt UNION ALL
select '01/02/2000' UNION ALL
select '01/03/2000' UNION ALL
select '01/04/2000'
),
temp2 as (
select '01/01/2000' start, '01/02/2000' end_dt, 'a' status UNION ALL
select '01/02/2000' start, '01/06/2000' end_dt, 'b' status
)
select *
from temp1
join temp2
on parse_date('%d/%m/%Y',temp1.dt) between parse_date('%d/%m/%Y',temp2.start) and date_add(parse_date('%d/%m/%Y', temp2.end_dt), interval -1 day)

Below is for BigQuery Standard SQL
#standardSQL
select month, status
from `project.dataset.tableA`
join `project.dataset.tableB`
on parse_date('%d/%m/%Y', month) >= parse_date('%d/%m/%Y', start)
and parse_date('%d/%m/%Y', month) < parse_date('%d/%m/%Y', `end`)
If to apply to sample data in your question as in below example
#standardSQL
with `project.dataset.tableA` as (
select '01/01/2000' month union all
select '01/02/2000' union all
select '01/03/2000' union all
select '01/04/2000'
), `project.dataset.tableB` as (
select '01/01/2000' start, '01/02/2000' `end`, 'a' status union all
select '01/02/2000', '01/06/2000', 'b'
)
select month, status
from `project.dataset.tableA`
join `project.dataset.tableB`
on parse_date('%d/%m/%Y', month) >= parse_date('%d/%m/%Y', start)
and parse_date('%d/%m/%Y', month) < parse_date('%d/%m/%Y', `end`)
output is
And btw, since mid October 2020 - BigQuery standard SQL supports DATE arithmetic operators. So, below will also work
#standardSQL
select month, status
from `project.dataset.tableA`
join `project.dataset.tableB`
on parse_date('%d/%m/%Y', month)
between parse_date('%d/%m/%Y', start)
and parse_date('%d/%m/%Y', `end`) - 1

Related

create time range with 2 columns date_time

The problem I am facing is how to find distinct time periods from multiple time periods with overlap in Teradata ANSI SQL.
For example, the attached tables contain multiple overlapping time periods, how can I combine those time periods into 3 unique time periods in Teradata SQL???
I think I can do it in python with the loop function, but not sure how to do it in SQL
ID
Start Date
End Date
001
2005-01-01
2006-01-01
001
2005-01-01
2007-01-01
001
2008-01-01
2008-06-01
001
2008-04-01
2008-12-01
001
2010-01-01
2010-05-01
001
2010-04-01
2010-12-01
001
2010-11-01
2012-01-01
My expected result is:
ID
start_Date
end_date
001
2005-01-01
2007-01-01
001
2008-01-01
2008-12-01
001
2010-01-01
2012-01-01
From Oracle 12, you can use MATCH_RECOGNIZE to perform a row-by-row comparison:
SELECT *
FROM table_name
MATCH_RECOGNIZE(
PARTITION BY id
ORDER BY start_date
MEASURES
FIRST(start_date) AS start_date,
MAX(end_date) AS end_date
ONE ROW PER MATCH
PATTERN (overlapping_ranges* last_range)
DEFINE overlapping_ranges AS NEXT(start_date) <= MAX(end_date)
)
Which, for the sample data:
CREATE TABLE table_name (ID, Start_Date, End_Date) AS
SELECT '001', DATE '2005-01-01', DATE '2006-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2005-01-01', DATE '2007-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-01-01', DATE '2008-06-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-04-01', DATE '2008-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-01-01', DATE '2010-05-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-04-01', DATE '2010-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-11-01', DATE '2012-01-01' FROM DUAL;
Outputs:
ID
START_DATE
END_DATE
001
2005-01-01 00:00:00
2007-01-01 00:00:00
001
2008-01-01 00:00:00
2008-12-01 00:00:00
001
2010-01-01 00:00:00
2012-01-01 00:00:00
db<>fiddle here
Update: Alternative query
SELECT id,
start_date,
end_date
FROM (
SELECT id,
dt,
SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
cnt
FROM (
SELECT ID,
dt,
SUM(type) OVER (PARTITION BY id ORDER BY dt, ROWNUM) * type AS cnt
FROM table_name
UNPIVOT (dt FOR type IN (start_date AS 1, end_date AS -1))
)
WHERE cnt IN (1,0)
)
PIVOT (MAX(dt) FOR cnt IN (1 AS start_date, 0 AS end_date))
Or, an equivalent that does not use UNPIVOT, PIVOT or ROWNUM and works in both Oracle and PostgreSQL:
SELECT id,
MAX(CASE cnt WHEN 1 THEN dt END) AS start_date,
MAX(CASE cnt WHEN 0 THEN dt END) AS end_date
FROM (
SELECT id,
dt,
SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
cnt
FROM (
SELECT ID,
dt,
SUM(type) OVER (PARTITION BY id ORDER BY dt, rn) * type AS cnt
FROM (
SELECT r.*,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY dt ASC, type DESC) AS rn
FROM (
SELECT id, 1 AS type, start_date AS dt FROM table_name
UNION ALL
SELECT id, -1 AS type, end_date AS dt FROM table_name
) r
) p
) s
WHERE cnt IN (1,0)
) t
GROUP BY id, grp
Update 2: Another Alternative
SELECT id,
MIN(start_date) AS start_date,
MAX(end_Date) AS end_date
FROM (
SELECT t.*,
SUM(CASE WHEN start_date <= prev_max THEN 0 ELSE 1 END)
OVER (PARTITION BY id ORDER BY start_date) AS grp
FROM (
SELECT t.*,
MAX(end_date) OVER (
PARTITION BY id ORDER BY start_date
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
) AS prev_max
FROM table_name t
) t
) t
GROUP BY id, grp
db<>fiddle Oracle PostgreSQL
This is a gaps and islands problem. Try this:
with u as
(select ID, start_date, end_date,
case
when start_date <= lag(end_date) over(partition by ID order by start_date, end_date) then 0
else 1 end as grp
from table_name),
v as
(select ID, start_date, end_date,
sum(grp) over(partition by ID order by start_date, end_date) as island
from u)
select ID, min(start_date) as start_Date, max(end_date) as end_date
from v
group by ID, island;
Fiddle
Basically you can identify "islands" by comparing start_date of current row to end_date of previous row (ordered by start_date, end_date), if it precedes it then it's the same island. Then you can do a rolling sum() to get the island numbers. Finally select min(start_date) and max(end_date) from each island to get the desired output.
This may work ,with little bit of change in function , I tried it in Dbeaver :
select ID,Start_Date,End_Date
from
(
select t.*,
dense_rank () over(partition by extract (year from Start_Date) order BY End_Date desc) drnk
from testing_123 t
) temp
where temp.drnk = 1
ORDER BY Start_Date;
Try this
WITH a as (
SELECT
ID,
LEFT(Start_Date, 4) as Year,
MIN(Start_Date) as New_Start_Date
FROM
TAB1
GROUP BY
ID,
LEFT(Start_Date, 4)
), b as (
SELECT
a.ID,
Year,
New_Start_Date,
End_Date
FROM
a
LEFT JOIN
TAB1
ON LEFT(a.New_Start_Date, 4) = LEFT(TAB1.Start_Date, 4)
)
select
ID,
New_Start_Date as Start_Date,
MAX(End_Date)
from
b
GROUP BY
ID,
New_Start_Date;
Example: https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=97f91b68c635aebfb752538cdd752ace

Getting a Monthly Date Range

How can you make a date range in a big query? A date range starts from 29th of the month and ends with 28th of the next month. It should be like this
Date | Starting Date | Ending Date
03-13-2020 | 02-29-2020 | 03-28-2021
06-30-2020 | 06-29-2020 | 07-28-2021
01-01-2021 | 12-29-2020 | 01-28-2021
11-11-2021 | 10-28-2021 | 11-29-2021
Actually, i make an article on it.
Check this out:
https://www.theaccountingtactics.com/2021/12/BigQueryBQ-DateProblems-DateSituations-that-are-Hard-to-Analyze-and-Takes-Time-ToCrack%20.html?m=1
Consider below approach
create temp function set_day(date date, day int64) as (
ifnull(
safe.date(extract(year from date), extract(month from date), day),
last_day(date)
)
);
select Date,
set_day(Starting_Date, 29) as Starting_Date,
set_day(Ending_Date, 28) as Ending_Date
from (
select *, if(extract(day from Date) < 29,
struct(date_sub(Date, interval 1 month) as Starting_Date, Date as Ending_Date),
struct(Date as Starting_Date, date_add(Date, interval 1 month) as Ending_Date)
).*
from your_table
)
if applied to sample data as in your question
with your_table as (
select date '2020-03-13' Date union all
select '2021-03-13' union all
select '2020-06-30' union all
select '2021-01-01' union all
select '2021-11-11'
)
output is
You can test whole stuff using below
create temp function set_day(date date, day int64) as (
ifnull(
safe.date(extract(year from date), extract(month from date), day),
last_day(date)
)
);
with your_table as (
select date '2020-03-13' Date union all
select '2021-03-13' union all
select '2020-06-30' union all
select '2021-01-01' union all
select '2021-11-11'
)
select Date,
set_day(Starting_Date, 29) as Starting_Date,
set_day(Ending_Date, 28) as Ending_Date
from (
select *, if(extract(day from Date) < 29,
struct(date_sub(Date, interval 1 month) as Starting_Date, Date as Ending_Date),
struct(Date as Starting_Date, date_add(Date, interval 1 month) as Ending_Date)
).*
from your_table
)

How do I calculate the Rolling Average for the difference of days between events on BigQuery?

I have a table of events that goes like this:
date event_category event_planner
2019-09-22T00:00:00 soccer_night Marcus
2019-09-25T00:00:00 comedy_night John
2019-09-28T00:00:00 dance_party John
2019-10-02T00:00:00 soccer_night Marcus
The idea here is to get the rolling average of the difference between dates for every planner.
So far I have the distance in days for each planner separated by category with the following:
DATE_DIFF(SAFE_CAST(date AS date),LAG(SAFE_CAST(date AS date)) OVER (PARTITION BY event_category, event_planner ORDER BY date), day) AS result
What I expect is something like this:
date event_category event_planner rolling_avg
2019-09-22T00:00:00 soccer_night Marcus 0
2019-09-25T00:00:00 comedy_night John 0
2019-09-28T00:00:00 comedy_night John 3
2019-10-02T00:00:00 soccer_night Marcus 10
2019-10-10T00:00:00 comedy_night John 7
Below is for BigQuery Standard SQL
#standardSQL
SELECT * EXCEPT(day, diff), IFNULL(AVG(diff) OVER(PARTITION BY event_category, event_planner ORDER BY day), 0) rolling_avg
FROM (
SELECT *, DATE_DIFF(day, LAG(day) OVER(PARTITION BY event_category, event_planner ORDER BY day), DAY) diff
FROM (
SELECT *, SAFE_CAST(date AS DATE) AS day
FROM `project.dataset.table`
)
)
If to apply to sample data from your question
WITH `project.dataset.table` AS (
SELECT TIMESTAMP '2019-09-22T00:00:00' date, 'soccer_night' event_category, 'Marcus' event_planner UNION ALL
SELECT '2019-09-25T00:00:00', 'comedy_night', 'John' UNION ALL
SELECT '2019-09-28T00:00:00', 'comedy_night', 'John' UNION ALL
SELECT '2019-10-02T00:00:00', 'soccer_night', 'Marcus' UNION ALL
SELECT '2019-10-10T00:00:00', 'comedy_night', 'John'
)
result is
Row date event_category event_planner rolling_avg
1 2019-09-22 00:00:00 UTC soccer_night Marcus 0
2 2019-09-25 00:00:00 UTC comedy_night John 0
3 2019-09-28 00:00:00 UTC comedy_night John 3.0
4 2019-10-02 00:00:00 UTC soccer_night Marcus 10.0
5 2019-10-10 00:00:00 UTC comedy_night John 7.5
How should I modify to use the average from the last three events of the same type by the same planner?
#standardSQL
SELECT * EXCEPT(day, diff),
IFNULL(AVG(diff) OVER(PARTITION BY event_category, event_planner ORDER BY day ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 0) rolling_avg
FROM (
SELECT *, DATE_DIFF(day, LAG(day) OVER(PARTITION BY event_category, event_planner ORDER BY day), DAY) diff
FROM (
SELECT *, SAFE_CAST(date AS DATE) AS day
FROM `project.dataset.table`
)
)
You could compute the last date in a subquery using lag(), and then do a rolling average in the outer query:
select
t.*,
avg(date_diff(date, lag_date, day)) over(
partition by event_category, event_planner order by date
) rolling_avg
from (
select
t.*
lag(date) over(
partition by event_category, event_planner order by date
) lag_date
from mytable t
) t
For the average you can use:
(DATE_DIFF(MIN(SAFE_CAST(date AS date)) OVER (PARTITION BY event_category, event_planner),
SAFE_CAST(date AS date),
day
) /
NULLIF(COUNT(*) OVER (PARTITION BY event_category, event_planner) - 1, 0)
) AS result

How to combine multiple SELECTs into a single SELECT by a common column in (BigQuery) SQL?

Given I have multiple tables in BigQuery, hence I have multiple SQL-statements that gives me "the number of X per day". For example:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as installs
FROM database.table1
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | installs |
-------------------------
| 2017-01-01 | 11 |
| 2017-01-02 | 22 |
etc
Another statement:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as uninstalls
FROM database.table2
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | uninstalls |
---------------------------
| 2017-01-02 | 22 |
| 2017-01-03 | 33 |
etc
Another statement:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as cases
FROM database.table3
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | cases |
----------------------
| 2017-01-01 | 11 |
| 2017-01-03 | 33 |
etc
etc
Now I need to combine all these into a single SELECT statement that gives the following results:
| day | installs | uninstalls | cases |
----------------------------------------------
| 2017-01-01 | 11 | 0 | 11 |
| 2017-01-02 | 22 | 22 | 0 |
| 2017-01-03 | 0 | 33 | 33 |
etc
Is this even possible?
Or what's the closest SQL-statement I can write that would give me a similar result?
Any feedback is appreciated!
Here is a self-contained example that might help to get you started. It uses two dummy tables, InstallEvents and UninstallEvents, which contain timestamps for the respective actions. It creates a common table expression called StartAndEnd that computes the minimum and maximum dates for these events in order to decide which dates to aggregate over, then unions the contents of the InstallEvents and UninstallEvents, counting the events for each day.
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
),
StartAndEnd AS (
SELECT MIN(DATE(timestamp)) AS min_date, MAX(DATE(timestamp)) AS max_date
FROM (
SELECT * FROM InstallEvents UNION ALL
SELECT * FROM UninstallEvents
)
)
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(
(SELECT min_date FROM StartAndEnd),
(SELECT max_date FROM StartAndEnd)
)) AS day
GROUP BY day
ORDER BY day;
If you know what the start and end dates are in advance, you can hard-code them in the query instead and then omit the StartAndEnd CTE:
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2017-01-04')) AS day
GROUP BY day
ORDER BY day;
To see the events in the sample data, use a query that unions the contents:
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
SELECT timestamp, true AS is_install
FROM InstallEvents UNION ALL
SELECT timestamp, false
FROM UninstallEvents;
Below is for BigQuery Standard SQL
#standardSQL
WITH calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
HAVING installs + uninstalls + cases > 0
-- ORDER BY day
Please note: you are using timestamp as a column name which is not the best practice as it is keyword, so in my example i leave your naming but consider to change this!
You can test / play this solution with below dummy data
#standardSQL
WITH `database.table1` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs
UNION ALL SELECT TIMESTAMP '2017-01-01', 22
),
`database.table2` AS (
SELECT TIMESTAMP '2016-12-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
`database.table3` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-10', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
HAVING installs + uninstalls + cases > 0
ORDER BY day
I am not very familiar with bigquery, so this is probably not going to be a copy-paste answer.
You'll first have to build a calander table to make sure you have all dates. Here's an example for sql server. There are probably examples for bigquery available as well. The following assumes a Calander table with Date attribute in timestamp.
Once you have your calander table you can join all your tables to that:
SELECT FORMAT_TIMESTAMP("%F",C.Date) AS day
, COUNT(T1.DATE(T1.TIMESTAMP)) AS installs --Here you could also use your FORMAT_TIMESTAMP
, COUNT(T1.DATE(T2.TIMESTAMP)) AS uninstalls
FROM Calander C
LEFT JOIN database.table1 T1
ON DATE(T1.TIMESTAMP) = DATE(C.Date) --Convert to date to remove times, you could also use your FORMAT_TIMESTAMP
LEFT JOIN database.table2 T2
ON DATE(T2.TIMESTAMP) = DATE(C.Date)
GROUP BY day
ORDER BY day ASC

Get data from last year

I am trying to get data from 1 year ago from the given date in bigquery.
date(DATE_ADD(date(DATE_ADD(timestamp(New_date),(DATEDIFF(timestamp(New_date), CURRENT_DATE())-1), "DAY")), -354, "DAY"))
The query above works, but when I try to put it in a case statement I get null.
sum(case when date(New_day) = date(DATE_ADD(date(DATE_ADD(timestamp(New_date),(DATEDIFF(timestamp(New_date), CURRENT_DATE())-1), "DAY")), -354, "DAY")) then 'this is working' else null end) as lastyeardata
How do I get data for 1 year ago on the same day to do a YoY comparison?
Edit #Mikhail's suggestions:
#standardSQl
WITH `yourTable` AS (
SELECT 1 AS id, '2017-08-14' AS New_date, '1234' AS volume UNION ALL
SELECT 2 AS id, '2017-08-13' AS New_date, '2345' AS volume UNION ALL
SELECT 3 AS id, '2017-08-14' AS New_date, '3456' AS volume UNION ALL
SELECT 4 AS id, '2017-08-14' AS New_date, '4567' AS volume UNION ALL
SELECT 5 AS id, '2016-08-14' AS New_date, '5678' AS volume UNION ALL
SELECT 6 AS id, '2016-08-13' AS New_date, '6789' AS volume UNION ALL
SELECT 7 AS id, '2016-08-12' AS New_date, '6789' AS volume UNION ALL
SELECT 8 AS id, '2016-08-11' AS New_date, '1011' AS volume
)
select
New_date
,volume as thisyeardata
,Case
WHEN PARSE_DATE('%Y-%m-%d', New_date) = DATE_ADD(PARSE_DATE('%Y-%m-%d', New_date), INTERVAL -1 YEAR)
THEN volume
ELSE NULL end as lastyearvolume
,DATE_ADD(PARSE_DATE('%Y-%m-%d', New_date), INTERVAL -1 YEAR) as lastyear
FROM `yourTable`
For some reason lastyearvolume is giving me null.
Below is for BigQuery Standard SQL (which is really recommended by BigQuery team to use versus Legacy one)
#standardSQl
WITH `yourTable` AS (
SELECT 1 AS id, '2017-08-14' AS New_date, '2016-08-14' AS New_day UNION ALL
SELECT 2 AS id, '2017-08-13' AS New_date, '2016-08-13' AS New_day UNION ALL
SELECT 3 AS id, '2017-08-14' AS New_date, '2016-08-12' AS New_day UNION ALL
SELECT 4 AS id, '2017-08-14' AS New_date, '2016-08-11' AS New_day
)
SELECT
COUNT(
CASE
WHEN PARSE_DATE('%Y-%m-%d', New_day) = DATE_ADD(PARSE_DATE('%Y-%m-%d', New_date), INTERVAL -1 YEAR)
THEN 'this is working'
ELSE NULL
END
) AS lastyeardata
FROM `yourTable`
As you would expect two rows are just counted as they have New_date and New_day year apart, the rest are not
Above example assumes your dates fields are of STRING type
If they are of Date Type - you just omit PARSE_DATE function
DATE_ADD has counterpart - DATE_SUB - so it can be used instead as DATE_SUB(PARSE_DATE('%Y-%m-%d', New_date), INTERVAL 1 YEAR)
Update based on your updated example:
simple and fast way to adjust your query t work:
#standardSQl
WITH `yourTable` AS (
SELECT 1 AS id, '2017-08-13' AS New_date, '1234' AS volume UNION ALL
SELECT 2 AS id, '2017-08-14' AS New_date, '2345' AS volume UNION ALL
SELECT 3 AS id, '2017-08-15' AS New_date, '3456' AS volume UNION ALL
SELECT 4 AS id, '2017-08-16' AS New_date, '4567' AS volume UNION ALL
SELECT 5 AS id, '2016-08-13' AS New_date, '5678' AS volume UNION ALL
SELECT 6 AS id, '2016-08-14' AS New_date, '6789' AS volume UNION ALL
SELECT 7 AS id, '2016-08-15' AS New_date, '6789' AS volume UNION ALL
SELECT 8 AS id, '2016-08-16' AS New_date, '1011' AS volume
)
SELECT
this_year.New_date
,this_year.volume AS thisyeardata
,last_year.volume AS lastyeardata
FROM `yourTable` AS this_year
JOIN `yourTable` AS last_year
ON PARSE_DATE('%Y-%m-%d', last_year.New_date) = DATE_ADD(PARSE_DATE('%Y-%m-%d', this_year.New_date), INTERVAL -1 YEAR)