Join Table to Itself for Last Year Results with Missing Dates - sql

I have a table of dates, channels, and sessions and I am trying to use a join command to add columns for each row containing the relevant value last year, however, I want to include the dates from last year that have no value this year and vice versa. The problem is that for dates that don't exist, I am getting doubling of the rows. Any thoughts on how to fix?
SELECT
ty.*,
ly.Date as Date_LY,
ly.Sessions as Sessions_LY
FROM
`testjoin` AS ty
FULL JOIN
`testjoin` as ly
ON
ly.Date = DATE_SUB(ty.Date, INTERVAl 1 YEAR)
AND ly.Channel = ty.Channel
Data:
Date Channel Sessions
01/01/2017 Email 5
02/02/2017 Email 10
01/01/2018 Email 11
02/02/2018 Email 17
01/01/2017 Organic 10
02/02/2017 Organic 15
01/01/2018 Organic 20
Desired Output:
Date Channel Sessions Sessions_LY
01/01/2017 Email 5 null
02/02/2017 Email 10 null
01/01/2018 Email 11 5
02/02/2018 Email 17 10
01/01/2017 Organic 10 null
02/02/2017 Organic 15 null
01/01/2018 Organic 20 10
02/02/2018 Organic null 15
Actual Output:
Date Channel Sessions Sessions_LY
01/01/2017 Organic 10
02/02/2017 Email 10
02/02/2017 Organic 15
01/01/2017 Email 5
01/01/2018 Email 11 5
01/01/2018 Organic 20 10
02/02/2018 Email 17 10
15
11
20
17

I think you want a cross join to generate the rows and a left join to bring in the values:
SELECT d.Date, c.Channel, ty.Sessions, ty_prev.Sessions
FROM (SELECT DISTINCT ty.Date
FROM testjoin ty
) d CROSS JOIN
(SELECT DISTINCT ty.channel FROM testjoin ty) c LEFT JOIN
testjoin ty
ON ty.Date = d.Date AND ty.Channel = c.Channel LEFT JOIN
testjoin ty_prev
ON ty_prev.Date = d.date - interval 1 year and ty.Channel = c.Channel;

Play with datepart as per your needs
with t (date, channel, sessions) as
(
select '01/01/2017', 'Email', 5 union all
select '02/02/2017', 'Email', 10 union all
select '01/01/2018', 'Email', 11 union all
select '02/02/2018', 'Email', 17 union all
select '01/01/2017', 'Organic', 10 union all
select '02/02/2017', 'Organic', 15 union all
select '01/01/2018', 'Organic', 20
)
select *, lag(sessions) over (partition by d.channel, datepart(mm, d.date) order by d.channel, datepart(mm, d.date)) l
from (select * from ((SELECT DISTINCT t.Date
FROM t) d
CROSS JOIN
(SELECT DISTINCT t.channel FROM t) c)) d left join t on d.Date = t.Date and d.channel = t.channel
order by d.channel, datepart(yyyy,d.date), datepart(mm, d.date)

Everything in your question indicates that you have only current(2018) and previous (2017) so below is based on this assumption and is for BigQuery Standard SQL
#standardSQL
WITH temp AS (
SELECT PARSE_DATE('%m/%d/%Y', Date) Date, Channel, Sessions
FROM `project.dataset.your_table`
), all_days AS (
SELECT Date, Channel FROM temp UNION DISTINCT
SELECT DATE_ADD(Date, INTERVAL 1 YEAR), Channel
FROM temp WHERE EXTRACT(YEAR FROM Date) = 2017
), all_data AS (
SELECT Date, Channel, Sessions, FORMAT_DATE('%m%d', Date) day
FROM all_days
LEFT JOIN temp USING(Date, Channel)
)
SELECT Date, Channel, Sessions,
LAG(Sessions) OVER(PARTITION BY day, Channel ORDER BY Date) Sessions_LY
FROM all_data
You can test / play with above using dummy data from your question as below
#standardSQL
WITH `project.dataset.your_table` AS (
SELECT '01/01/2017' Date, 'Email' Channel, 5 Sessions UNION ALL
SELECT '02/02/2017', 'Email', 10 UNION ALL
SELECT '01/01/2018', 'Email', 11 UNION ALL
SELECT '02/02/2018', 'Email', 17 UNION ALL
SELECT '01/01/2017', 'Organic', 10 UNION ALL
SELECT '02/02/2017', 'Organic', 15 UNION ALL
SELECT '01/01/2018', 'Organic', 20
), temp AS (
SELECT PARSE_DATE('%m/%d/%Y', Date) Date, Channel, Sessions
FROM `project.dataset.your_table`
), all_days AS (
SELECT Date, Channel FROM temp UNION DISTINCT
SELECT DATE_ADD(Date, INTERVAL 1 YEAR), Channel
FROM temp WHERE EXTRACT(YEAR FROM Date) = 2017
), all_data AS (
SELECT Date, Channel, Sessions, FORMAT_DATE('%m%d', Date) day
FROM all_days
LEFT JOIN temp USING(Date, Channel)
)
SELECT Date, Channel, Sessions,
LAG(Sessions) OVER(PARTITION BY day, Channel ORDER BY Date) Sessions_LY
FROM all_data
ORDER BY 2, 1
result is
Row Date Channel Sessions Sessions_LY
1 2017-01-01 Email 5 null
2 2017-02-02 Email 10 null
3 2018-01-01 Email 11 5
4 2018-02-02 Email 17 10
5 2017-01-01 Organic 10 null
6 2017-02-02 Organic 15 null
7 2018-01-01 Organic 20 10
8 2018-02-02 Organic null 15

Related

SQL : create intermediate data from date range

I have a table as shown here:
USER
ROI
DATE
1
5
2021-11-24
1
4
2021-11-26
1
6
2021-11-29
I want to get the ROI for the dates in between the other dates, expected result will be as below
From 2021-11-24 to 2021-11-30
USER
ROI
DATE
1
5
2021-11-24
1
5
2021-11-25
1
4
2021-11-26
1
4
2021-11-27
1
4
2021-11-28
1
6
2021-11-29
1
6
2021-11-30
You may use a calendar table approach here. Create a table containing all dates and then join with it. Sans an actual table, you may use an inline CTE:
WITH dates AS (
SELECT '2021-11-24' AS dt UNION ALL
SELECT '2021-11-25' UNION ALL
SELECT '2021-11-26' UNION ALL
SELECT '2021-11-27' UNION ALL
SELECT '2021-11-28' UNION ALL
SELECT '2021-11-29' UNION ALL
SELECT '2021-11-30'
),
cte AS (
SELECT USER, ROI, DATE, LEAD(DATE) OVER (ORDER BY DATE) AS NEXT_DATE
FROM yourTable
)
SELECT t.USER, t.ROI, d.dt
FROM dates d
INNER JOIN cte t
ON d.dt >= t.DATE AND (d.dt < t.NEXT_DATE OR t.NEXT_DATE IS NULL)
ORDER BY d.dt;

SQL - Query to return active subscriptions on a given day

I have a table that shows when a user signs up for a subscription and when their membership will expire. A user can purchase a new subscription even if their current one is in force.
userid|purchasedate|expirydate
1 |2019-01-01 |2019-02-01
2 |2019-01-02 |2019-02-02
3 |2019-01-03 |2019-02-03
3 |2019-01-04 |2019-03-03
I need a SQL query that will GROUP BY the date and return the number of active subscriptions on that date. So it would return:
date |count
2019-01-01|1
2019-01-02|2
2019-01-03|3
2019-01-04|3
Below is for BigQuery Standard SQL
#standardSQL
SELECT day, COUNT(DISTINCT userid) active_subscriptions
FROM (SELECT AS STRUCT MIN(purchasedate) min_date, MAX(expirydate) max_date FROM `project.dataset.table`),
UNNEST(GENERATE_DATE_ARRAY(min_date, max_date)) day
JOIN `project.dataset.table`
ON day BETWEEN purchasedate AND expirydate
GROUP BY day
You can test, play with above using dummy data from your question as in below example
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 userid, DATE '2019-01-01' purchasedate, DATE '2019-02-01' expirydate UNION ALL
SELECT 2, '2019-01-02', '2019-02-02' UNION ALL
SELECT 3, '2019-01-03', '2019-02-03' UNION ALL
SELECT 3, '2019-01-04', '2019-03-03'
)
SELECT day, COUNT(DISTINCT userid) active_subscriptions
FROM (SELECT AS STRUCT MIN(purchasedate) min_date, MAX(expirydate) max_date FROM `project.dataset.table`),
UNNEST(GENERATE_DATE_ARRAY(min_date, max_date)) day
JOIN `project.dataset.table`
ON day BETWEEN purchasedate AND expirydate
GROUP BY day
with below output
Row day active_subscriptions
1 2019-01-01 1
2 2019-01-02 2
3 2019-01-03 3
4 2019-01-04 3
5 2019-01-05 3
6 2019-01-06 3
... ... ...
... ... ...
31 2019-01-31 3
32 2019-02-01 3
33 2019-02-02 2
34 2019-02-03 1
35 2019-02-04 1
... ... ...
... ... ...
61 2019-03-02 1
62 2019-03-03 1
You need a list of dates and count(distinct):
select d.dte, count(distinct t.userid) as num_users
from (select distinct purchase_date as dte from t) d left join
t
on d.dte >= t.dte and
d.dte <= t.expiry_date
group by d.dte
order by d.dte;
EDIT:
BigQuery can be fickle about inequalities in the on clause. Here is another approach:
select dte, count(distinct t.userid) as num_users
from t cross join
unnest(generate_date_array(t.purchase_date, t.expiry_date, interval 1 day)) dte
group by dte
order by dte;
You can use a where clause to filter down to particular dates.
I make the table name 'test_expirydate' and use your data
and this one work
select
tb1.expirydate,
count(*) as total
from test_expirydate as tb1
left join (
select
expirydate
from test_expirydate as tb2
group by userid
) as tb2
on tb1.expirydate >= tb2.expirydate
group by tb1.expirydate
I don't sure is it work in other case or not but it fine with current data
Oh, I interpret that the left column should be the expiration date.

Incremental business day column that resets each month

I need to create a table that contains records with 1) all 365 days of the year and 2) a counter representing which business day of the month the day is. Non-business days should be represented with a 0. For example:
Date | Business Day
2019-10-01 1
2019-10-02 2
2019-10-03 3
2019-10-04 4
2019-10-05 0 // Saturday
2019-10-06 0 // Sunday
2019-10-07 5
....
2019-11-01 1
2019-11-02 0 // Saturday
2019-11-03 0 // Sunday
2019-11-04 2
So far, I've been able to create a table that contains all dates of the year.
CREATE TABLE ${TMPID}_days_of_the_year
(
`theDate` STRING
);
INSERT OVERWRITE TABLE ${TMPID}_days_of_the_year
select
dt_set.theDate
from
(
-- last 0~99 months
select date_sub('2019-12-31', a.s + 10*b.s + 100*c.s) as theDate
from
(
select 0 as s union all select 1 union all select 2 union all select 3 union all select 4 union all select 5 union all select 6 union all select 7 union all select 8 union all select 9
) a
cross join
(
select 0 as s union all select 1 union all select 2 union all select 3 union all select 4 union all select 5 union all select 6 union all select 7 union all select 8 union all select 9
) b
cross join
(
select 0 as s union all select 1 union all select 2 union all select 3
) c
) dt_set
where dt_set.theDate between '2019-01-01' and '2019-12-31'
order by dt_set.theDate DESC;
And I also have a table that contains all of the weekend days and holidays (this data is loaded from a file, and the date format is YYYY-MM-DD)
CREATE TABLE ${TMPID}_company_holiday
(
`holidayDate` STRING
)
;
LOAD DATA LOCAL INPATH '${FILE}' INTO TABLE ${TMPID}_company_holiday;
My question is.... how do I join these tables together while creating the business day counter column shown as in the sample data above?
You can use row_number() for the enumeration. This is a little tricky, because it needs to be conditional, but the information you need is provided by a left join:
select dy.*,
(case when ch.holiday_date is null
then row_number() over (partition by trunc(dy.date, 'MONTH'), ch.holiday_date
order by dy.date
)
else 0
end) as business_day
from days_of_the_year dy left join
company_holiday ch
on dy.date = ch.holiday_date;

Frequency Distribution by Day

I have records of No. of calls coming to a call center. When a call comes into a call center a ticket is open.
So, let's say ticket 1 (T1) is open on 8/1/19 and it stays open till 8/5/19. So, if a person ran a query everyday then on 8/1 it will show 1 ticket open...same think on day 2 till day 5....I want to get records by day to see how many tickets were open for each day.....
In short, Frequency Distribution by Day.
Ticket Open_date Close_date
T1 8/1/2019 8/5/2019
T2 8/1/2019 8/6/2019
Result:
Result
Date # Tickets_Open
8/1/2019 2
8/2/2019 2
8/3/2019 2
8/4/2019 2
8/5/2019 2
8/6/2019 1
8/7/2019 0
8/8/2019 0
8/9/2019 0
8/10/2019 0
We can handle your requirement via the use of a calendar table, which stores all dates covering the full range in your data set.
WITH dates AS (
SELECT '2019-08-01' AS dt UNION ALL
SELECT '2019-08-02' UNION ALL
SELECT '2019-08-03' UNION ALL
SELECT '2019-08-04' UNION ALL
SELECT '2019-08-05' UNION ALL
SELECT '2019-08-06' UNION ALL
SELECT '2019-08-07' UNION ALL
SELECT '2019-08-08' UNION ALL
SELECT '2019-08-09' UNION ALL
SELECT '2019-08-10'
)
SELECT
d.dt,
COUNT(t.Open_date) AS num_tickets_open
FROM dates d
LEFT JOIN tickets t
ON d.dt BETWEEN t.Open_date AND t.Close_date
GROUP BY
d.dt;
Note that in practice if you expect to have this reporting requirement in the long term, you might want to replace the dates CTE above with a bona-fide table of dates.
This solution generates the list of dates from the tickets table using CTE recursion and calculates the count:
WITH Tickets(Ticket, Open_date, Close_date) AS
(
SELECT "T1", "8/1/2019", "8/5/2019"
UNION ALL
SELECT "T2", "8/1/2019", "8/6/2019"
),
Ticket_dates(Ticket, Dates) as
(
SELECT t1.Ticket, CONVERT(DATETIME, t1.Open_date)
FROM Tickets t1
UNION ALL
SELECT t1.Ticket, DATEADD(dd, 1, CONVERT(DATETIME, t1.Dates))
FROM Ticket_dates t1
inner join Tickets t2 on t1.Ticket = t2.Ticket
where DATEADD(dd, 1, CONVERT(DATETIME, t1.Dates)) <= CONVERT(DATETIME, t2.Close_date)
)
SELECT CONVERT(varchar, Dates, 1), count(*)
FROM Ticket_dates
GROUP by Dates
ORDER by Dates
A "general purpose" trick is to generate a series of numbers, which can be done using CTE's but there are many alternatives, and from that create the needed range of dates. Once that exists then you can left join your ticket data to this and then count by date.
CREATE TABLE mytable(
Ticket VARCHAR(8) NOT NULL PRIMARY KEY
,Open_date DATE NOT NULL
,Close_date DATE NOT NULL
);
INSERT INTO mytable(Ticket,Open_date,Close_date) VALUES ('T1','8/1/2019','8/5/2019');
INSERT INTO mytable(Ticket,Open_date,Close_date) VALUES ('T2','8/1/2019','8/6/2019');
Also note I am using a cross apply in this example to "attach" the min and max dates of your tickets to each numbered row. You would need to include your own logic on what data to select here.
;WITH
cteDigits AS (
SELECT 0 AS digit UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9
)
, cteTally AS (
SELECT
[1s].digit
+ [10s].digit * 10
+ [100s].digit * 100 /* add more like this as needed */
AS num
FROM cteDigits [1s]
CROSS JOIN cteDigits [10s]
CROSS JOIN cteDigits [100s] /* add more like this as needed */
)
select
n.num + 1 rownum
, dateadd(day,n.num,ca.min_date) as on_date
, count(t.Ticket) as tickets_open
from cteTally n
cross apply (select min(Open_date), max(Close_date) from mytable) ca (min_date, max_date)
left join mytable t on dateadd(day,n.num,ca.min_date) between t.Open_date and t.Close_date
where dateadd(day,n.num,ca.min_date) <= ca.max_date
group by
n.num + 1
, dateadd(day,n.num,ca.min_date)
order by
rownum
;
result:
+--------+---------------------+--------------+
| rownum | on_date | tickets_open |
+--------+---------------------+--------------+
| 1 | 01.08.2019 00:00:00 | 2 |
| 2 | 02.08.2019 00:00:00 | 2 |
| 3 | 03.08.2019 00:00:00 | 2 |
| 4 | 04.08.2019 00:00:00 | 2 |
| 5 | 05.08.2019 00:00:00 | 2 |
| 6 | 06.08.2019 00:00:00 | 1 |
+--------+---------------------+--------------+

Categorize website visitors starting from the first occasion, based on if condition

Could you please help me with sql statement, preferreby it should work in big query. I have 3 columns userid, date, hostname. I need to create additional column - client_type on the following condition: when userid first time comes to hostname = "online-store.com" then from this date on client_type for this particular userid will be always "current_client" else "visitor".
For example, in the image (link attached) we have userid = 1 and 4 who had become "current client". User 4 was just a visitor, but after visiting hostname = "online-store.com" he will be always classified as "current client".enter image description here
Below is for BigQuery Standard SQL
#standardSQL
SELECT
userid, date, hostname,
IF(0 = COUNTIF(hostname = 'online-store.com') OVER(
PARTITION BY userid ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
), 'visitor', 'current_client') client_type
FROM `project.dataset.table`
You can test, play with above using dummy data you provided in your question
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 userid, DATE '2018-02-01' date, 'online-store.com' hostname UNION ALL
SELECT 2, '2018-02-01', 'other' UNION ALL
SELECT 3, '2018-02-01', 'other' UNION ALL
SELECT 4, '2018-02-01', 'other' UNION ALL
SELECT 1, '2018-02-01', 'other' UNION ALL
SELECT 1, '2018-04-07', 'other' UNION ALL
SELECT 4, '2018-04-08', 'online-store.com' UNION ALL
SELECT 5, '2018-04-08', 'other' UNION ALL
SELECT 6, '2018-04-08', 'other' UNION ALL
SELECT 4, '2018-04-08', 'other' UNION ALL
SELECT 8, '2018-04-08', 'other' UNION ALL
SELECT 1, '2018-07-07', 'other' UNION ALL
SELECT 1, '2018-11-22', 'online-store.com'
)
SELECT
userid, date, hostname,
IF(0 = COUNTIF(hostname = 'online-store.com') OVER(
PARTITION BY userid ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
), 'visitor', 'current_client') client_type
FROM `project.dataset.table`
ORDER BY date
with result
Row userid date hostname client_type
1 1 2018-02-01 online-store.com current_client
2 1 2018-02-01 other current_client
3 2 2018-02-01 other visitor
4 3 2018-02-01 other visitor
5 4 2018-02-01 other visitor
6 1 2018-04-07 other current_client
7 4 2018-04-08 online-store.com current_client
8 4 2018-04-08 other current_client
9 5 2018-04-08 other visitor
10 6 2018-04-08 other visitor
11 8 2018-04-08 other visitor
12 1 2018-07-07 other current_client
13 1 2018-11-22 online-store.com current_client
This should be good:
#standardSQL
with userdates as (
select userid, hostname, min(date) as mindate from `dataset.table` where hostname = 'online-store.com' group by userid, hostname
)
select u.userid, u.date, u.hostname, case when u.date >= ud.mindate then 'current_user' else 'visitor' end as client_type
from `dataset.table` u
left outer join userdates ud on u.userid = ud.userid
order by 1, 2