How to do a partitioned outer join in BigQuery - sql

I would like to implement the partitioned outer join in BigQuery. To give a concrete example, I'd like to achieve the partitioned outer join as the accepted answer here: https://dba.stackexchange.com/questions/227069/what-is-a-partitioned-outer-join
I understand there are a lot of discussions about this topic, but I can't make it work under BigQuery. I added partition by date after the left table following the same syntax in the answer as follows:
select * from (
select '2019-01-17' as date, 'London' as location, 11 as qty
union all
select '2019-01-15' as date, 'London' as location, 10 as qty
union all
select '2019-01-16' as date, 'Paris' as location, 20 as qty
union all
select '2019-01-17' as date, 'Boston' as location, 31 as qty
union all
select '2019-01-16' as date, 'Boston' as location, 30 as qty
) as sales partition by (date)
right join
(
select 'London' as location
union all
select 'Paris' as location
union all
select 'Boston' as location
)
as loc
using (location)
The target result I'm looking for is:
date qty location
15-JAN-19 NULL Boston
15-JAN-19 10 London
15-JAN-19 NULL Paris
16-JAN-19 30 Boston
16-JAN-19 NULL London
16-JAN-19 20 Paris
17-JAN-19 31 Boston
17-JAN-19 11 London
17-JAN-19 NULL Paris
But I got the following error: Syntax error: Unexpected keyword PARTITION at [11:12]
How can I implement it in BigQuery?

Below is for BigQuery Standard SQL
#standardSQL
-- Emulates a partitioned outer join: build the full grid of every distinct
-- sales date combined with every location, then attach the matching sales row
-- (qty stays NULL where a date/location pair has no sale).
WITH sale_dates AS (
  SELECT DISTINCT `date`
  FROM sales
)
SELECT
  d.`date`,
  s.qty,
  l.location
FROM sale_dates AS d
CROSS JOIN loc AS l
LEFT JOIN sales AS s
  ON s.`date` = d.`date`
  AND s.location = l.location
You can test, play with above using sample data from your question as in below example
#standardSQL
-- Self-contained demo: inline sample data for sales and loc, followed by the
-- date x location grid query that emulates a partitioned outer join.
WITH sales AS (
  SELECT '2019-01-17' AS `date`, 'London' AS location, 11 AS qty UNION ALL
  SELECT '2019-01-15', 'London', 10 UNION ALL
  SELECT '2019-01-16', 'Paris', 20 UNION ALL
  SELECT '2019-01-17', 'Boston', 31 UNION ALL
  SELECT '2019-01-16', 'Boston', 30
), loc AS (
  SELECT 'London' AS location UNION ALL
  SELECT 'Paris' UNION ALL
  SELECT 'Boston'
)
SELECT `date`, qty, location
-- Every distinct sales date ...
FROM (SELECT DISTINCT `date` FROM sales)
-- ... paired with every location ...
CROSS JOIN loc
-- ... then matched back to the actual sale, if any (qty is NULL otherwise).
LEFT JOIN sales
USING (`date`, location)
-- Sort explicitly: without ORDER BY the row order of the result is not
-- guaranteed, so a sorted listing could not be reproduced reliably.
ORDER BY `date`, location
with below result
Row date qty location
1 2019-01-15 null Boston
2 2019-01-15 10 London
3 2019-01-15 null Paris
4 2019-01-16 30 Boston
5 2019-01-16 null London
6 2019-01-16 20 Paris
7 2019-01-17 31 Boston
8 2019-01-17 11 London
9 2019-01-17 null Paris
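If you need the dates in a format like 15-JAN-19, use the query below (note that %b produces a mixed-case month such as Jan; wrap the FORMAT_DATE call in UPPER() if you need JAN)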
#standardSQL
-- Same date x location grid, with the date rendered as text in day-month-year
-- form via FORMAT_DATE. The CAST suggests `date` is not stored as a DATE
-- (presumably a string) -- confirm against the real table.
WITH sale_dates AS (
  SELECT DISTINCT `date`
  FROM sales
)
SELECT
  FORMAT_DATE('%d-%b-%y', CAST(d.`date` AS DATE)) AS `date`,
  s.qty,
  l.location
FROM sale_dates AS d
CROSS JOIN loc AS l
LEFT JOIN sales AS s
  ON s.`date` = d.`date`
  AND s.location = l.location
so result will be
Row date qty location
1 15-Jan-19 null Boston
2 15-Jan-19 10 London
3 15-Jan-19 null Paris
4 16-Jan-19 30 Boston
5 16-Jan-19 null London
6 16-Jan-19 20 Paris
7 17-Jan-19 31 Boston
8 17-Jan-19 11 London
9 17-Jan-19 null Paris

Related

List the branch that monthly pays the most in salaries

I have this table; the expected output should be B003, since it pays 54,000 in total
STAFF
SALARY
BRAN
SL21
30000
B005
SG37
12000
B003
SG14
18000
B003
SA9
9000
B007
SG5
24000
B003
SL41
9000
B005
So far I only have this subquery, which isn't working how I expected.
SELECT BRANCHNO
FROM STAFF
WHERE (SALARY) IN (SELECT MAX(SUM(SALARY))
FROM STAFF
GROUP BY BRANCHNO);
This works but I want a subquery that returns the branchno
SELECT MAX(SUM(SALARY))
FROM STAFF
GROUP BY BRANCHNO;
-- Total salary per branch, keeping only the branch(es) whose total equals the
-- highest per-branch total. If several branches tie for the top, all are returned.
SELECT BRANCHNO, SUM(SALARY) AS sum_sal
FROM STAFF
GROUP BY BRANCHNO
-- The nested aggregate MAX(SUM(...)) with GROUP BY yields a single value:
-- the largest of the per-branch sums.
HAVING SUM(SALARY) = (SELECT MAX(SUM(SALARY))
                      FROM STAFF
                      GROUP BY BRANCHNO);
The column used to group the rows can be displayed. So, add BRANCHNO to your select clause.
One option is to use rank analytic function which ranks branches by sum of their salaries in descending order; you'd then return the one(s) that rank as the highest (rnk = 1).
Sample data:
SQL> with staff (staff, salary, bran) as
2 (select 'SL21', 30000, 'B005' from dual union all
3 select 'SG37', 12000, 'B003' from dual union all
4 select 'SG14', 18000, 'B003' from dual union all
5 select 'SA9' , 9000, 'B007' from dual union all
6 select 'SG5' , 24000, 'B003' from dual union all
7 select 'SL41', 9000, 'B005' from dual
8 )
Query:
9 select bran
10 from (select bran, rank() over (order by sum(salary) desc) rnk
11 from staff
12 group by bran
13 )
14 where rnk = 1;
BRAN
----
B003
SQL>

Group by rows which are in sequence

Consider I have a table like this
PASSENGER CITY DATE
43 NEW YORK 1-Jan-21
44 CHICAGO 4-Jan-21
43 NEW YORK 2-Jan-21
43 NEW YORK 3-Jan-21
44 ROME 5-Jan-21
43 LONDON 4-Jan-21
44 CHICAGO 6-Jan-21
44 CHICAGO 7-Jan-21
How would I group Passenger and City column in sequence to get a result like below?
PASSENGER CITY COUNT
43 NEW YORK 3
44 CHICAGO 1
44 ROME 1
43 LONDON 1
44 CHICAGO 2
One way to deal with such a gaps-and-islands problem is to calculate a ranking for the gaps.
Then group also on that ranking.
-- Gaps-and-islands: count the rows in each run of consecutive days that a
-- passenger spends in the same city.
WITH flagged AS (
    -- gap = 0 when the passenger's previous row for the same city is exactly
    -- one day earlier; otherwise 1 (including the first row, where LAG is NULL).
    SELECT t.PASSENGER
         , t.CITY
         , t."DATE"
         , CASE
             WHEN TRUNC(t."DATE")
                  - TRUNC(LAG(t."DATE")
                          OVER (PARTITION BY t.PASSENGER, t.CITY ORDER BY t."DATE")) = 1
             THEN 0
             ELSE 1
           END AS gap
    FROM table_name t
),
numbered AS (
    -- Running total of the gap flags per passenger: every row of one run
    -- ends up with the same Rnk value.
    SELECT f.*
         , SUM(f.gap) OVER (PARTITION BY f.PASSENGER ORDER BY f."DATE") AS Rnk
    FROM flagged f
)
SELECT PASSENGER
     , CITY
     , COUNT(*) AS "Count"
     -- MIN("DATE") / MAX("DATE") can be added here to expose each run's
     -- start and end date.
FROM numbered
GROUP BY PASSENGER, CITY, Rnk
ORDER BY MIN("DATE"), PASSENGER
PASSENGER
CITY
Count
43
NEW YORK
3
43
LONDON
1
44
CHICAGO
1
44
ROME
1
44
CHICAGO
2
db<>fiddle here
From Oracle 12, you can use MATCH_RECOGNIZE:
-- Collapse each passenger's date-ordered rows into runs of the same city and
-- report the city and the number of rows of every run.
SELECT *
FROM table_name
MATCH_RECOGNIZE (
-- Runs are detected separately for each passenger ...
PARTITION BY passenger
-- ... scanning that passenger's rows in date order.
ORDER BY "DATE"
MEASURES
-- City of the first row of the run.
FIRST(city) AS city,
-- Number of rows in the run.
COUNT(*) AS count
-- A run is one or more consecutive rows classified as same_city.
PATTERN (same_city+)
DEFINE
-- A row is same_city when its city equals the city of the run's first row.
same_city AS FIRST(city) = city
);
Which, for the sample data:
-- Builds table_name from eight literal sample rows (passenger, city, date).
-- "DATE" is quoted because it is used here as a column name.
CREATE TABLE table_name (PASSENGER, CITY, "DATE") AS
SELECT 43, 'NEW YORK', DATE '2021-01-01' FROM DUAL UNION ALL
SELECT 44, 'CHICAGO', DATE '2021-01-04' FROM DUAL UNION ALL
SELECT 43, 'NEW YORK', DATE '2021-01-02' FROM DUAL UNION ALL
SELECT 43, 'NEW YORK', DATE '2021-01-03' FROM DUAL UNION ALL
SELECT 44, 'ROME', DATE '2021-01-05' FROM DUAL UNION ALL
SELECT 43, 'LONDON', DATE '2021-01-04' FROM DUAL UNION ALL
SELECT 44, 'CHICAGO', DATE '2021-01-06' FROM DUAL UNION ALL
SELECT 44, 'CHICAGO', DATE '2021-01-07' FROM DUAL
Outputs:
PASSENGER
CITY
COUNT
43
NEW YORK
3
43
LONDON
1
44
CHICAGO
1
44
ROME
1
44
CHICAGO
2
If you have ordered the input result set (note: tables should be considered to be unordered) and want to maintain the order then:
-- Variant that follows the order in which the rows are returned instead of the
-- date: each row is numbered with ROWNUM first, runs are detected in that
-- numbering, and the result is sorted by the number of each run's first row.
SELECT *
-- rn records the position at which each row was returned by the inner query.
FROM (SELECT t.*, ROWNUM AS rn FROM table_name t)
MATCH_RECOGNIZE (
PARTITION BY passenger
-- Scan each passenger's rows in input order rather than by date.
ORDER BY RN
MEASURES
-- Position, date and city of the first row of the run.
FIRST(rn) AS rn,
FIRST("DATE") AS "DATE",
FIRST(city) AS city,
-- Number of rows in the run.
COUNT(*) AS count
-- A run is one or more consecutive rows classified as same_city.
PATTERN (same_city+)
DEFINE
-- A row is same_city when its city equals the city of the run's first row.
same_city AS FIRST(city) = city
)
-- List the runs by the position of their first row.
ORDER BY rn
Outputs:
PASSENGER
RN
DATE
CITY
COUNT
43
1
01-JAN-21
NEW YORK
3
44
2
04-JAN-21
CHICAGO
1
44
5
05-JAN-21
ROME
1
43
6
04-JAN-21
LONDON
1
44
7
06-JAN-21
CHICAGO
2
db<>fiddle here

SQL Joining transactions on Date Range

In SQL Server 2014, I'm working with two tables, an EMPLOYEE and a SALES table:
EMPID EMPNAME HIRE_DATE
---------------------------
1234 JOHN SMITH 2021-05-01
1235 JANE DOE 2021-08-05
1236 JANE SMITH 2021-07-31
EMPID SALE_DATE PRODUCT
-------------------------------------
1234 2021-05-05 VPN
1234 2021-05-10 VPN Basic
1234 2021-07-15 Cloud Storage Bronze
1234 2021-07-05 Cloud Storage Gold
1235 2021-10-01 Antivirus
I need to write a query that will produce all rows/columns from the EMPLOYEE table, with a column showing their (aggregated) sales, but ONLY sales that were triggered within 30 days of the hire date.
This query works, but will pull in ALL sales completed until present:
SELECT EMP.*, SALES_30_DAYS
FROM EMP
LEFT JOIN
(SELECT EMPID, COUNT(*)
FROM SALES_30_DAYS
GROUP BY EMPID) ON EMP.EMPID = SALES.EMPID
In this other attempt, HIRE_DATE is not recognized in the sub-query.
SELECT EMP.*, SALES_30_DAYS
FROM EMP
LEFT JOIN
(SELECT EMPID, COUNT(*) SALES_30_DAYS
FROM SALES
WHERE DATEDIFF(DD, HIRE_DATE, SALE_DATE) < 30
GROUP BY EMPID) ON EMP.EMPID= SALES.EMPID
How can I re-write this query, so that the second table will provide the aggregated sales ONLY if the sale took place up to 30 days after the hire date?
Desired outcome:
EMPID EMPNAME HIRE_DATE SALES_30_DAYS
-----------------------------------------
1234 JOHN SMITH 2021-05-01 2
1235 JANE DOE 2021-08-05 1
1236 JANE SMITH 2021-07-31 NULL
-- Demo: for every employee, count the sales made from the hire date up to and
-- including 30 days after it.
-- The sample dates are typed as DATE so the range check is a true date
-- comparison. Left as strings, it would mix a lexical string comparison with
-- an implicit varchar-to-datetime conversion whose result depends on the
-- session's DATEFORMAT/language setting.
WITH EMPLOYEES(EMPID, EMPNAME, HIRE_DATE) AS
(
    SELECT 1234, 'JOHN SMITH', CAST('2021-05-01' AS DATE) UNION ALL
    SELECT 1235, 'JANE DOE',   CAST('2021-08-05' AS DATE) UNION ALL
    SELECT 1236, 'JANE SMITH', CAST('2021-07-31' AS DATE)
),
SALES(EMPID, SALE_DATE, PRODUCT) AS
(
    SELECT 1234, CAST('2021-05-05' AS DATE), 'VPN' UNION ALL
    SELECT 1234, CAST('2021-05-10' AS DATE), 'VPN Basic' UNION ALL
    SELECT 1234, CAST('2021-07-15' AS DATE), 'Cloud Storage Bronze' UNION ALL
    SELECT 1234, CAST('2021-07-05' AS DATE), 'Cloud Storage Gold' UNION ALL
    SELECT 1235, CAST('2021-10-01' AS DATE), 'Antivirus'
)
SELECT E.EMPID, E.EMPNAME, E.HIRE_DATE, SALE_QUERY.CNTT
FROM EMPLOYEES E
-- OUTER APPLY evaluates the subquery once per employee, so it can reference
-- E.HIRE_DATE, and keeps employees that have no qualifying sales.
OUTER APPLY
(
    -- COUNT(*) returns 0, not NULL, when no sale falls inside the window;
    -- wrap it in NULLIF(COUNT(*), 0) if NULL is required instead.
    SELECT COUNT(*) AS CNTT
    FROM SALES AS S
    WHERE S.EMPID = E.EMPID
      AND S.SALE_DATE >= E.HIRE_DATE
      AND S.SALE_DATE <= DATEADD(DAY, 30, E.HIRE_DATE)
) SALE_QUERY
Could you please check whether the above works for you?

How to filter my table based on this specific date criteria?

I am using SQL Server 2014. Below is an extract of Table t1:
rownum RoomID ArrivalDate DepartureDate Name GuestID
1 287 2020-01-01 2020-01-09 John 600
2 451 2020-01-09 2020-01-10 John 600
3 458 2020-01-09 2020-01-10 John 600
1 240 2020-03-19 2020-03-21 Alan 112
2 159 2020-03-21 2020-03-22 Alan 112
1 400 2020-05-01 2020-05-10 Joe 225
2 155 2020-06-13 2020-06-18 Joe 225
1 200 2020-07-01 2020-07-08 Smith 980
2 544 2020-07-08 2020-07-10 Smith 980
3 428 2020-09-01 2020-09-05 Smith 980
...
The problem: I need to filter this table so that the output gives me only those rows of a guest where the difference/s between his ArrivalDate (at rownum 2 or 3 or 4...) and his DepartureDate (at rownum =1) is greater than 0.
To simplify: If we take Guest John, his ArrivalDate for rownum=2 and rownum=3 are both the same as his DepartureDate for rownum=1; therefore I want to exclude him completely in my output. Same for Guest Alan. However, for Guest Smith, only the row where rownum=2 needs to be excluded.
Note: all guests in this table will have at least a rownum=2 (that is, a minimum of 2 entries).
My expected output:
rownum RoomID ArrivalDate DepartureDate Name GuestID
1 400 2020-05-01 2020-05-10 Joe 225
2 155 2020-06-13 2020-06-18 Joe 225
1 200 2020-07-01 2020-07-08 Smith 980
3 428 2020-09-01 2020-09-05 Smith 980
I am stuck on how to write the logic behind this filter. Any help would be appreciated.
The trick here appears to be keeping the first row when there is a match -- but not including any rows otherwise. You can use window functions:
-- Keep a guest only if at least one later stay (rownum <> 1) starts after the
-- departure of the first stay (rownum = 1); for such a guest return the first
-- stay plus every stay that starts after that departure.
with stay as (
    select t1.*,
           -- departure date of the guest's first stay, repeated on every row
           max(case when rownum = 1 then departuredate end)
               over (partition by guestid) as departuredate_1,
           -- latest arrival date among the guest's other stays
           max(case when rownum <> 1 then arrivaldate end)
               over (partition by guestid) as arrivaldate_not_1
    from t1
)
select stay.*
from stay
where arrivaldate_not_1 > departuredate_1
  and (rownum = 1 or arrivaldate > departuredate_1);
Here is a db<>fiddle.
Please try the query below and confirm whether this is what you are expecting:
-- Keep only the stays whose arrival date does not coincide with a departure
-- date recorded for a guest of the same name (t1 is the table from the question).
-- NOT EXISTS is used because SQL Server does not accept the row-value form
-- "(a, b) NOT IN (subquery)"; it is also unaffected by NULLs in the subquery.
-- Note: a guest's first stay is still returned even when all of that guest's
-- later stays are filtered out.
select s.*
from t1 as s
where not exists (
    select 1
    from t1 as d
    where d.DepartureDate = s.ArrivalDate
      and d.Name = s.Name
);
-- Temporary table holding the ten sample stays from the question.
create table #Aridept
(
    rownum        int,
    RoomID        int,
    ArrivalDate   date,
    DepartureDate date,
    Name          varchar(20),
    GuestID       int
);

insert into #Aridept (rownum, RoomID, ArrivalDate, DepartureDate, Name, GuestID)
values
    (1, 287, '2020-01-01', '2020-01-09', 'John',  600),
    (2, 451, '2020-01-09', '2020-01-10', 'John',  600),
    (3, 458, '2020-01-09', '2020-01-10', 'John',  600),
    (1, 240, '2020-03-19', '2020-03-21', 'Alan',  112),
    (2, 159, '2020-03-21', '2020-03-22', 'Alan',  112),
    (1, 400, '2020-05-01', '2020-05-10', 'Joe',   225),
    (2, 155, '2020-06-13', '2020-06-18', 'Joe',   225),
    (1, 200, '2020-07-01', '2020-07-08', 'Smith', 980),
    (2, 544, '2020-07-08', '2020-07-10', 'Smith', 980),
    (3, 428, '2020-09-01', '2020-09-05', 'Smith', 980);
-- Copy into #temp every later stay (rownum > 1) whose arrival date is not the
-- departure date of the same guest's first stay (rownum = 1).
select later_stay.*
into #temp
from #Aridept as later_stay
where later_stay.rownum > 1
  and later_stay.ArrivalDate not in (
        select first_stay.DepartureDate
        from #Aridept as first_stay
        where first_stay.GuestID = later_stay.GuestID
          and first_stay.rownum = 1
      )
--final result query
-- First stay of every guest that has at least one row in #temp, combined with
-- the #temp rows themselves, listed per guest in rownum order.
select first_stay.*
from #Aridept as first_stay
where first_stay.rownum = 1
  and exists (
        select 1
        from #temp as kept
        where kept.GuestID = first_stay.GuestID
      )
union all
select kept.*
from #temp as kept
order by GuestID, rownum

group by on date

I have two tables, Sales and Calls as follows:
**Sales**
CUST_ID INT primary key,
CUST_NM Varchar(40),
Sale_date Datetime2,
SALES Money);
CUST_ID CUST_NM Sale_date SALES
1 Dom 2015-01-01 15:00:02.3000000 10.00
2 Brian 2015-01-02 15:00:02.3000000 12.00
3 Stu 2015-01-03 15:00:02.3000000 21.00
4 John 2015-01-04 15:00:02.3000000 41.00
5 Jack 2015-01-05 15:00:02.3000000 51.00
6 Jill 2015-01-05 15:00:02.3000000 61.00
7 Steve 2015-01-04 15:00:02.3000000 16.00
8 Stacey 2015-01-03 15:00:02.3000000 19.00
9 Lacey 2015-01-03 15:00:02.3000000 30.00
Calls
NAME Varchar(40),
CALL_DATE Date,
TOTAL_CALLS INT
NAME CALL_DATE TOTAL_CALLS
Dom 2015-01-01 2
Brian 2015-01-02 4
Stu 2015-01-03 3
John 2015-01-04 5
Jack 2015-01-05 6
Jill 2015-01-05 10
Steve 2015-01-04 8
Stacey 2015-01-03 7
Lacey 2015-01-03 9
I want to write a select statement that brings back the date, gross sales, and the total calls from both Sales and Calls, joined on date.
Here is what I wrote, and I think it should be right, but somehow I am not getting the right output.
select Calls.CALL_DATE, sum(Sales.SALES) as gross_sale, sum(Calls.TOTAL_CALLS) as gross_total_calls
from Sales
join
Calls
on convert (date,sales.Sale_date)=calls.CALL_DATE
group by Calls.CALL_DATE
order by Calls.CALL_DATE
The output I am getting is
CALL_DATE gross_sale gross_total_calls
2015-01-01 10.00 2
2015-01-02 12.00 4
2015-01-03 210.00 57
2015-01-04 114.00 26
2015-01-05 224.00 32
Where am I going wrong??
You are generating a Cartesian product for each day. You need to aggregate before the join. Or, you can do this with a union all and aggregation:
-- Stack sales and calls into one row set keyed by calendar date, with 0 in the
-- measure the source table does not provide, then total both measures per
-- date. Because the two tables are never joined row-to-row, no per-day
-- Cartesian product can inflate the sums.
with daily as (
    select cast(s.sale_date as date) as dte,
           s.sales                   as sales,
           0                         as calls
    from sales as s
    union all
    select c.call_date,
           0,
           c.total_calls
    from calls as c
)
select dte,
       sum(sales) as sales,
       sum(calls) as total_calls
from daily
group by dte
order by dte;
Here is the alternative approach suggested by Gordon (aggregate each table before joining), written as a runnable test script that uses table variables.
Note the two extra rows of data and the FULL OUTER JOIN which allows all data to be returned.
-- Runnable test script: aggregate sales and calls per day separately, then
-- combine the two daily totals with a FULL OUTER JOIN so that dates present in
-- only one of the tables are still returned.
-- Table variables must be named with an @ prefix (# denotes temporary tables,
-- which are created with CREATE TABLE); they are scoped to the batch, so run
-- the whole script as a single batch.
declare @Sales table (CUST_ID INT primary key, CUST_NM Varchar(40), Sale_date Datetime2, SALES Money);
insert into @Sales (CUST_ID, CUST_NM, Sale_date, SALES)
select 1, 'Dom', '2015-01-01 15:00:02.3000000', 10.00 union all
select 2, 'Brian', '2015-01-02 15:00:02.3000000', 12.00 union all
select 3, 'Stu', '2015-01-03 15:00:02.3000000', 21.00 union all
select 4, 'John', '2015-01-04 15:00:02.3000000', 41.00 union all
select 5, 'Jack', '2015-01-05 15:00:02.3000000', 51.00 union all
select 6, 'Jill', '2015-01-05 15:00:02.3000000', 61.00 union all
select 7, 'Steve', '2015-01-04 15:00:02.3000000', 16.00 union all
select 8, 'Stacey', '2015-01-03 15:00:02.3000000', 19.00 union all
select 9, 'Lacey', '2015-01-03 15:00:02.3000000', 30.00 union all
-- extra row: a sale on a date that has no calls
select 10, 'Tom', '2015-01-07 15:00:02.3000000', 1.00;

declare @Calls table (NAME Varchar(40), CALL_DATE Date, TOTAL_CALLS INT);
insert into @Calls (NAME, CALL_DATE, TOTAL_CALLS)
select 'Dom', '2015-01-01', 2 union all
select 'Brian', '2015-01-02', 4 union all
select 'Stu', '2015-01-03', 3 union all
select 'John', '2015-01-04', 5 union all
select 'Jack', '2015-01-05', 6 union all
select 'Jill', '2015-01-05', 10 union all
select 'Steve', '2015-01-04', 8 union all
select 'Stacey', '2015-01-03', 7 union all
select 'Lacey', '2015-01-03', 9 union all
-- extra row: calls on a date that has no sales
select 'Tom', '2015-01-06', 1;

-- Show the raw test data.
select CUST_ID, CUST_NM, Sale_date, SALES from @Sales;
select NAME, CALL_DATE, TOTAL_CALLS from @Calls;

-- One row per calendar date with that day's sales total and calls total;
-- the total missing on a one-sided date is NULL.
select coalesce(a.CALL_DATE, b.CALL_DATE) as CALL_DATE, a.gross_sale, b.TOTAL_CALLS
from
    ( select convert(date, Sale_date) as CALL_DATE, sum(SALES) as gross_sale
      from @Sales
      group by convert(date, Sale_date)
    ) a
full outer join
    ( select CALL_DATE, sum(TOTAL_CALLS) as TOTAL_CALLS
      from @Calls
      group by CALL_DATE
    ) b on a.CALL_DATE = b.CALL_DATE
-- Sort by the combined date: a.CALL_DATE alone is NULL for dates that exist
-- only in @Calls, which would push those rows out of chronological order.
order by coalesce(a.CALL_DATE, b.CALL_DATE);