get unique matches - sql

How can I get the unique matches played between two teams?
Game_Number, Team_A, Team_B, Date, Team_A_score, Team_B_score
1, IND, USA, 2020-01-01, 10, 20
2, USA, IND, 2020-01-02, 10, 20
3, AUS, IND, 2020-01-02, 30, 15
4, IND, AUS, 2020-01-03, 22, 34
5, UAE, AUS, 2020-01-04, 14, 41
Expected output
IND, USA
AUS, IND
UAE, AUS
In the above case, games 1 and 2 are between the same two teams, so they count as a single unique match-up.

If you do not mind the order of the teams then use DISTINCT with GREATEST and LEAST:
-- One row per unordered pairing of teams: the alphabetically smaller name is
-- always reported as team_a, so (IND, USA) and (USA, IND) collapse together.
SELECT
    LEAST(Team_A, Team_B)    AS team_a,
    GREATEST(Team_A, Team_B) AS team_b
FROM table_name
GROUP BY
    LEAST(Team_A, Team_B),
    GREATEST(Team_A, Team_B)
Which, for the sample data:
-- Sample data for the team-pairing queries: five games, where games 1-2 pair
-- IND/USA and games 3-4 pair AUS/IND in both team orders.
-- The date column name is quoted because DATE is an Oracle reserved word.
CREATE TABLE table_name (Game_Number, Team_A, Team_B, "DATE", Team_A_score, Team_B_score) AS
SELECT 1, 'IND', 'USA', DATE '2020-01-01', 10, 20 FROM DUAL UNION ALL
SELECT 2, 'USA', 'IND', DATE '2020-01-02', 10, 20 FROM DUAL UNION ALL
SELECT 3, 'AUS', 'IND', DATE '2020-01-02', 30, 15 FROM DUAL UNION ALL
SELECT 4, 'IND', 'AUS', DATE '2020-01-03', 22, 34 FROM DUAL UNION ALL
SELECT 5, 'UAE', 'AUS', DATE '2020-01-04', 14, 41 FROM DUAL;
Outputs:
TEAM_A
TEAM_B
IND
USA
AUS
IND
AUS
UAE
If you want to respect the order of the teams and get the earliest instance of a pairing of teams then:
-- Earliest game for each unordered pairing of teams, keeping the teams in the
-- order in which they were recorded for that game.
SELECT Team_A,
       Team_B
FROM (
    SELECT Team_A,
           Team_B,
           ROW_NUMBER() OVER (
               -- LEAST/GREATEST make (IND, USA) and (USA, IND) the same partition.
               PARTITION BY LEAST(Team_A, Team_B), GREATEST(Team_A, Team_B)
               -- Game_Number breaks ties when a pairing plays more than once on
               -- the same date, so the chosen row is deterministic.
               ORDER BY "DATE", Game_Number
           ) AS rn
    FROM table_name
)
WHERE rn = 1;
Which outputs:
TEAM_A
TEAM_B
AUS
IND
UAE
AUS
IND
USA
db<>fiddle here

Related

how to use windows function during merge in sql

I am working in Oracle SQL. I have two tables which are linked to each other by one column, company_id (see the picture). I want to merge table 1 into table 2 and calculate the 6-month average of income (over the 6 months preceding each date in table 2) for each company_id and each date of table 2. I would appreciate any code or ideas on how to solve this task.
You can use an analytic range window to calculate the averages for table1 and then JOIN the result to table2:
-- Rolling 6- and 12-month average income per company, computed for every
-- table1 row and then attached to the matching (company_id, dt) rows of table2.
WITH rolling_income AS (
    SELECT company_id,
           dt,
           -- The current month plus the 5 months before it.
           ROUND(
               AVG(income) OVER (
                   PARTITION BY company_id
                   ORDER BY dt
                   RANGE BETWEEN INTERVAL '5' MONTH PRECEDING AND CURRENT ROW
               ),
               2
           ) AS avg_income_6,
           -- The current month plus the 11 months before it.
           ROUND(
               AVG(income) OVER (
                   PARTITION BY company_id
                   ORDER BY dt
                   RANGE BETWEEN INTERVAL '11' MONTH PRECEDING AND CURRENT ROW
               ),
               2
           ) AS avg_income_12
    FROM table1
)
SELECT t2.*,
       ri.avg_income_6,
       ri.avg_income_12
FROM table2 t2
LEFT JOIN rolling_income ri
    ON ri.company_id = t2.company_id
   AND ri.dt = t2.dt;
Which, for the sample data:
-- Sample data: income for company 1, one row per month from 2019-01 to 2021-04.
CREATE TABLE table1 (company_id, dt, income) AS
SELECT 1, date '2019-01-01', 65 FROM DUAL UNION ALL
SELECT 1, date '2019-02-01', 58 FROM DUAL UNION ALL
SELECT 1, date '2019-03-01', 12 FROM DUAL UNION ALL
SELECT 1, date '2019-04-01', 81 FROM DUAL UNION ALL
SELECT 1, date '2019-05-01', 38 FROM DUAL UNION ALL
SELECT 1, date '2019-06-01', 81 FROM DUAL UNION ALL
SELECT 1, date '2019-07-01', 38 FROM DUAL UNION ALL
SELECT 1, date '2019-08-01', 69 FROM DUAL UNION ALL
SELECT 1, date '2019-09-01', 54 FROM DUAL UNION ALL
SELECT 1, date '2019-10-01', 90 FROM DUAL UNION ALL
SELECT 1, date '2019-11-01', 10 FROM DUAL UNION ALL
SELECT 1, date '2019-12-01', 12 FROM DUAL UNION ALL
SELECT 1, date '2020-01-01', 11 FROM DUAL UNION ALL
SELECT 1, date '2020-02-01', 83 FROM DUAL UNION ALL
SELECT 1, date '2020-03-01', 18 FROM DUAL UNION ALL
SELECT 1, date '2020-04-01', 28 FROM DUAL UNION ALL
SELECT 1, date '2020-05-01', 52 FROM DUAL UNION ALL
SELECT 1, date '2020-06-01', 21 FROM DUAL UNION ALL
SELECT 1, date '2020-07-01', 54 FROM DUAL UNION ALL
SELECT 1, date '2020-08-01', 30 FROM DUAL UNION ALL
SELECT 1, date '2020-09-01', 12 FROM DUAL UNION ALL
SELECT 1, date '2020-10-01', 25 FROM DUAL UNION ALL
SELECT 1, date '2020-11-01', 86 FROM DUAL UNION ALL
SELECT 1, date '2020-12-01', 4 FROM DUAL UNION ALL
SELECT 1, date '2021-01-01', 10 FROM DUAL UNION ALL
SELECT 1, date '2021-02-01', 72 FROM DUAL UNION ALL
SELECT 1, date '2021-03-01', 65 FROM DUAL UNION ALL
SELECT 1, date '2021-04-01', 25 FROM DUAL;
-- Sample data: the (company_id, dt) pairs that the averages are joined onto.
CREATE TABLE table2 (company_id, dt) AS
SELECT 1, date '2019-06-01' FROM DUAL UNION ALL
SELECT 1, date '2019-09-01' FROM DUAL UNION ALL
SELECT 1, date '2019-12-01' FROM DUAL UNION ALL
SELECT 1, date '2020-01-01' FROM DUAL UNION ALL
SELECT 1, date '2020-07-01' FROM DUAL UNION ALL
SELECT 1, date '2020-08-01' FROM DUAL UNION ALL
SELECT 1, date '2021-03-01' FROM DUAL UNION ALL
SELECT 1, date '2021-04-01' FROM DUAL;
Outputs:
COMPANY_ID
DT
AVG_INCOME_6
AVG_INCOME_12
1
2019-06-01 00:00:00
55.83
55.83
1
2019-09-01 00:00:00
60.17
55.11
1
2019-12-01 00:00:00
45.5
50.67
1
2020-01-01 00:00:00
41
46.17
1
2020-07-01 00:00:00
42.67
41.83
1
2020-08-01 00:00:00
33.83
38.58
1
2021-03-01 00:00:00
43.67
38.25
1
2021-04-01 00:00:00
43.67
38
db<>fiddle here
I don't think you need any window function here (if you were thinking of analytic functions); ordinary avg with appropriate join conditions should do the job.
Sample data:
-- Session transcript (the leading numbers are the client's line prompts).
-- Inline sample data: table1 holds monthly income for company 1 during 2019
-- only; table2 holds the dates to report on, two of which fall in 2020.
SQL> with
2 table1 (company_id, datum, income) as
3 (select 1, date '2019-01-01', 65 from dual union all
4 select 1, date '2019-02-01', 58 from dual union all
5 select 1, date '2019-03-01', 12 from dual union all
6 select 1, date '2019-04-01', 81 from dual union all
7 select 1, date '2019-05-01', 38 from dual union all
8 select 1, date '2019-06-01', 81 from dual union all
9 select 1, date '2019-07-01', 38 from dual union all
10 select 1, date '2019-08-01', 69 from dual union all
11 select 1, date '2019-09-01', 54 from dual union all
12 select 1, date '2019-10-01', 90 from dual union all
13 select 1, date '2019-11-01', 10 from dual union all
14 select 1, date '2019-12-01', 12 from dual
15 ),
16 table2 (company_id, datum) as
17 (select 1, date '2019-06-01' from dual union all
18 select 1, date '2019-09-01' from dual union all
19 select 1, date '2019-12-01' from dual union all
20 select 1, date '2020-01-01' from dual union all
21 select 1, date '2020-07-01' from dual
22 )
Query begins here:
-- Average table1 income for each table2 row over the half-open window
-- (b.datum - 6 months, b.datum]: the start is excluded, the end is included.
-- The join is an inner join, so a table2 row with no table1 income inside its
-- window (2020-07-01 in the sample data) produces no output row at all.
23 select b.company_id,
24 b.datum ,
25 round(avg(a.income), 2) result
26 from table1 a join table2 b on a.company_id = b.company_id
27 and a.datum > add_months(b.datum, -6)
28 and a.datum <= b.datum
29 group by b.company_id, b.datum;
COMPANY_ID DATUM RESULT
---------- -------- ----------
1 01.06.19 55,83
1 01.09.19 60,17
1 01.12.19 45,5
1 01.01.20 47
SQL>

SQL Oracle - Sales Forecast

I am doing a sales forecast in Oracle SQL. I need to figure out what revenue I can expect for next year. I should calculate it for each month (in my example, January and February 2018) for each customer by state/city. I have data for 3 years.
The result should contain an estimated sales forecast for each month based on the city + state combination. I was trying to use regr_slope, but it doesn't work. I have code here: SQL Fiddle
-- Asker's original attempt, kept as posted (reported as not working).
-- The GROUP BY lists every column, including year and revenue, so each group
-- only contains rows that share a single year and a single revenue value;
-- the regression functions therefore have no variation in year to fit.
-- NOTE(review): presumably this yields NULL forecasts or an error - confirm.
select c.*,
max(year) +1 forecast_year,
regr_slope(revenue, year)
* (max(year) + 1)
+ regr_intercept(revenue, year) forecasted_revenue
from New_customer_data c
group by Cust_ID ,
State ,
City ,
year ,
id_month ,
revenue ;
I need to figure out what revenue I can expect for next year.
Remove revenue and year from the GROUP BY clause as those are the columns you want to perform the regression on:
-- Linear-regression revenue forecast for the year after the latest one on
-- record, fitted separately for each customer / city / state / month.
-- forecasted_revenue = intercept + slope * forecast_year.
SELECT
    c.cust_id,
    c.city,
    c.state,
    c.id_month,
    MAX(c.year) + 1 AS forecast_year,
    REGR_INTERCEPT(c.revenue, c.year)
        + REGR_SLOPE(c.revenue, c.year) * (MAX(c.year) + 1) AS forecasted_revenue
FROM New_customer_data c
GROUP BY
    c.cust_id,
    c.city,
    c.state,
    c.id_month;
Which, for your sample data:
-- Sample data: two years (2016, 2017) of revenue for four customers, one
-- city/state per month. The column list is spelled out so the load does not
-- depend on the physical column order of the table.
INSERT INTO New_customer_data (cust_id, state, city, year, id_month, revenue)
SELECT 1, 'MN', 'Minneapolis', 2016, 1, 679862 FROM dual UNION ALL
SELECT 1, 'IL', 'Chicago', 2016, 2, 11862 FROM dual UNION ALL
SELECT 1, 'MN', 'Minneapolis', 2017, 1, 547365 FROM dual UNION ALL
SELECT 1, 'IL', 'Chicago', 2017, 2, 705365 FROM dual UNION ALL
SELECT 2, 'CA', 'San Diego', 2016, 1, 51074 FROM dual UNION ALL
SELECT 2, 'CA', 'LA', 2016, 2, 598862 FROM dual UNION ALL
SELECT 2, 'CA', 'San Diego', 2017, 1, 705365 FROM dual UNION ALL
SELECT 2, 'CA', 'LA', 2017, 2, 50611 FROM dual UNION ALL
SELECT 3, 'CA', 'Santa Monica', 2016, 1, 190706 FROM dual UNION ALL
SELECT 3, 'IL', 'Evanston', 2016, 2, 679862 FROM dual UNION ALL
SELECT 3, 'CA', 'Santa Monica', 2017, 1, 705365 FROM dual UNION ALL
SELECT 3, 'IL', 'Evanston', 2017, 2, 90393 FROM dual UNION ALL
SELECT 4, 'MN', 'Shakopee', 2016, 1, 31649 FROM dual UNION ALL
SELECT 4, 'FL', 'Miami', 2016, 2, 888862 FROM dual UNION ALL
SELECT 4, 'MN', 'Shakopee', 2017, 1, 125365 FROM dual UNION ALL
SELECT 4, 'FL', 'Miami', 2017, 2, 30566 FROM dual;
Outputs:
CUST_ID
CITY
STATE
ID_MONTH
FORECAST_YEAR
FORECASTED_REVENUE
1
Minneapolis
MN
1
2018
414868
1
Chicago
IL
2
2018
1398868
2
San Diego
CA
1
2018
1359656
2
LA
CA
2
2018
-497640
3
Santa Monica
CA
1
2018
1220024
3
Evanston
IL
2
2018
-499076
4
Shakopee
MN
1
2018
219081
4
Miami
FL
2
2018
-827730
db<>fiddle here

Complex query analyzing historical records

I am using Oracle and trying to retrieve the total number of days a person was out of the office during the year. I have 2 tables involved:
Statuses
1 - Active
2 - Out of the Office
3 - Other
ScheduleHistory
RecordID - primary key
PersonID
PreviousStatusID
NextStatusID
DateChanged
I can easily find when the person went on vacation and when they came back, using
SELECT DateChanged FROM ScheduleHistory WHERE PersonID=111 AND NextStatusID = 2
and
SELECT DateChanged FROM ScheduleHistory WHERE PersonID=111 AND PreviousStatusID = 2
But if a person went on vacation more than once, how can I calculate the total number of days that person was out of the office? Is it possible to do this programmatically, given only the PersonID?
Here is some sample data:
RecordID PersonID PreviousStatusID NextStatusID DateChanged
-----------------------------------------------------------------------------
1 111 1 2 03/11/2020
2 111 2 1 03/13/2020
3 111 1 3 04/01/2020
4 111 3 1 04/07/2020
5 111 1 2 06/03/2020
6 111 2 1 06/05/2020
7 111 1 2 09/14/2020
8 111 2 1 09/17/2020
So from the data above, for the year 2020 for PersonID 111 the query should return 7
Try this:
-- For every status change, measure the days elapsed since the same person's
-- previous change (in recordid order). Summing the gaps on rows that leave
-- status 2 gives the total out-of-office days per person.
WITH change_gaps AS (
    SELECT sh.*,
           TO_DATE(datechanged, 'MM/DD/YYYY')
             - LAG(TO_DATE(datechanged, 'MM/DD/YYYY'))
                   OVER (PARTITION BY personid ORDER BY recordid) AS lag_date
    FROM ScheduleHistory sh
)
SELECT personid,
       SUM(lag_date) AS tot_days_ooo
FROM change_gaps
WHERE previousstatusid = 2
GROUP BY personid;
If you want total days (or weekdays) for each year (and to account for periods when it goes over the year boundary) then:
-- Total days and weekdays spent in status 2, reported per person and per
-- calendar year; a period that crosses New Year is split at each 1 January.

-- date_ranges: one row per status change, running from that change to the
-- person's next change. The last change of a person has no successor, so LEAD
-- falls back to its own date and the range has zero length.
WITH date_ranges ( personid, status, start_date, end_date ) AS (
SELECT personid,
nextstatusid,
datechanged,
LEAD(datechanged, 1, datechanged) OVER(
PARTITION BY personid
ORDER BY datechanged
)
FROM table_name
),
-- split_year_ranges (recursive): cuts every status-2 range into pieces that
-- stay within one calendar year. "year" holds 1 January of the piece's year,
-- and max_date carries the true end of the whole range through the recursion.
split_year_ranges ( personid, year, start_date, end_date, max_date ) AS (
-- Anchor: first piece, capped at 1 January of the following year.
SELECT personid,
TRUNC( start_date, 'YY' ),
start_date,
LEAST(
end_date,
ADD_MONTHS( TRUNC( start_date, 'YY' ), 12 )
),
end_date
FROM date_ranges
WHERE status = 2
UNION ALL
-- Recursive step: the previous piece stopped at a year boundary before the
-- range ended, so continue from there for at most another 12 months.
SELECT personid,
end_date,
end_date,
LEAST( max_date, ADD_MONTHS( end_date, 12 ) ),
max_date
FROM split_year_ranges
WHERE end_date < max_date
)
SELECT personid,
EXTRACT( YEAR FROM year) AS year,
SUM( end_date - start_date ) AS total_days,
-- Weekdays: 5 per whole ISO week between the two Mondays (TRUNC 'IW'), then
-- adjusted by each endpoint's offset into its own week, capped at 5 so that
-- Saturday and Sunday add nothing.
SUM(
( TRUNC( end_date, 'IW' ) - TRUNC( start_date, 'IW' ) ) * 5 / 7
+ LEAST( end_date - TRUNC( end_date, 'IW' ), 5 )
- LEAST( start_date - TRUNC( start_date, 'IW' ), 5 )
) AS total_weekdays
FROM split_year_ranges
GROUP BY personid, year
ORDER BY personid, year
Which, for the sample data:
-- Sample data: the eight status changes for person 111 from the question,
-- plus three rows for person 222 whose status-2 periods cross the 2019/2020
-- and 2020/2021 year boundaries.
CREATE TABLE table_name ( RecordID, PersonID, PreviousStatusID, NextStatusID, DateChanged ) AS
SELECT 1, 111, 1, 2, DATE '2020-03-11' FROM DUAL UNION ALL
SELECT 2, 111, 2, 1, DATE '2020-03-13' FROM DUAL UNION ALL
SELECT 3, 111, 1, 3, DATE '2020-04-01' FROM DUAL UNION ALL
SELECT 4, 111, 3, 1, DATE '2020-04-07' FROM DUAL UNION ALL
SELECT 5, 111, 1, 2, DATE '2020-06-03' FROM DUAL UNION ALL
SELECT 6, 111, 2, 1, DATE '2020-06-05' FROM DUAL UNION ALL
SELECT 7, 111, 1, 2, DATE '2020-09-14' FROM DUAL UNION ALL
SELECT 8, 111, 2, 1, DATE '2020-09-17' FROM DUAL UNION ALL
SELECT 9, 222, 1, 2, DATE '2019-12-31' FROM DUAL UNION ALL
SELECT 10, 222, 2, 2, DATE '2020-12-01' FROM DUAL UNION ALL
SELECT 11, 222, 2, 2, DATE '2021-01-02' FROM DUAL;
Outputs:
PERSONID
YEAR
TOTAL_DAYS
TOTAL_WEEKDAYS
111
2020
7
7
222
2019
1
1
222
2020
366
262
222
2021
1
1
db<>fiddle here
Provided no vacation crosses a year boundary
-- Total out-of-office days per person.
-- paired_changes numbers the "went out" (-> 2) and "came back" (-> 1) changes
-- separately per person, so the n-th departure and the n-th return share the
-- same grp value. Changes coming from status 3 are ignored.
WITH paired_changes AS (
    SELECT sh.PersonID,
           sh.DateChanged,
           ROW_NUMBER() OVER (
               PARTITION BY sh.PersonID, sh.NextStatusID
               ORDER BY sh.DateChanged
           ) AS grp
    FROM ScheduleHistory sh
    WHERE sh.NextStatusID IN (1, 2)
      AND sh.PreviousStatusID <> 3
),
-- One row per absence: days between the departure and the matching return.
absences AS (
    SELECT PersonID,
           MAX(DateChanged) - MIN(DateChanged) AS duration
    FROM paired_changes
    GROUP BY PersonID, grp
)
SELECT PersonID,
       SUM(duration) AS days_out
FROM absences
GROUP BY PersonID;
db<>fiddle
year_span is used to split an interval that spans two years into two separate records
H1 adds a per-person row number (ordered by RecordID) to get the right sequence for each person
H2 gets the period for each status change and extracts the first day of the year in which the interval ends
H3 splits records that span two years and calculates the correct start and end date for each piece
H calculates the days elapsed in each interval for each year
the final query sums the records to produce the output
EDIT
If you need workdays instead of total days, you should not use total_days/7*5 because it is a bad approximation and in some cases gives weird results.
I have posted a solution to jump on fridays to mondays here
-- Days spent in each status, per person and per calendar year, with intervals
-- that cross a year boundary split into one piece per year.
with
-- Lookup of status id -> description.
statuses (sid, sdescr) as (
select 1, 'Active' from dual union all
select 2, 'Out of the Office' from dual union all
select 3, 'Other' from dual
),
-- Inline sample data standing in for the real ScheduleHistory table.
ScheduleHistory(RecordID, PersonID, PreviousStatusID, NextStatusID , DateChanged) as (
select 1, 111, 1, 2, date '2020-03-11' from dual union all
select 2, 111, 2, 1, date '2020-03-13' from dual union all
select 3, 111, 1, 3, date '2020-04-01' from dual union all
select 4, 111, 3, 1, date '2020-04-07' from dual union all
select 5, 111, 1, 2, date '2020-06-03' from dual union all
select 6, 111, 2, 1, date '2020-06-05' from dual union all
select 7, 111, 1, 2, date '2020-09-14' from dual union all
select 8, 111, 2, 1, date '2020-09-17' from dual union all
SELECT 9, 222, 1, 2, date '2019-12-31' from dual UNION ALL
SELECT 10, 222, 2, 2, date '2020-12-01' from dual UNION ALL
SELECT 11, 222, 2, 2, date '2021-01-02' from dual
),
-- Two-row helper (n = 1, 2) used to emit up to two pieces per interval.
-- NOTE(review): with only two rows, an interval covering three or more
-- calendar years still yields two pieces, the first one running all the way
-- to 1 January of the final year - confirm that such intervals cannot occur.
year_span (n) as (
select 1 from dual union all
select 2 from dual
),
-- H1: per-person sequence number (PID) in RecordID order.
H1 AS (
SELECT ROW_NUMBER() OVER (PARTITION BY PersonID ORDER BY RecordID) PID, H.*
FROM ScheduleHistory H
),
-- H2: pairs every change with the same person's next change.
--   DateChanged2 = date of the next change (end of the interval)
--   Y            = number of calendar years the interval touches
--   Y2           = 1 January of the year in which the interval ends
-- A person's last change has no successor, so these three are NULL for it.
H2 as (
SELECT
H1.*, H2.DateChanged DateChanged2,
EXTRACT(YEAR FROM H2.DateChanged) - EXTRACT(YEAR FROM H1.DateChanged) + 1 Y,
trunc(H2.DateChanged,'YEAR') Y2
FROM H1 H1
LEFT JOIN H1 H2 ON H1.PID = H2.PID-1 AND H1.PersonID = H2.PersonID
),
-- H3: one piece (D1..D2) per interval and year_span row with N <= Y.
--   Y = 1        -> single piece, DateChanged .. DateChanged2
--   Y > 1, N = 1 -> DateChanged .. Y2
--   Y > 1, N = 2 -> Y2 .. DateChanged2
-- Rows whose Y is NULL never satisfy N <= Y and are dropped by the join.
H3 AS (
SELECT Y, N, H2.PID, H2.RecordID, H2.PersonID, H2.NextStatusID,
CASE WHEN Y=1 THEN H2.DateChanged ELSE CASE WHEN N=1 THEN H2.DateChanged ELSE Y2 END END D1,
CASE WHEN Y=1 THEN H2.DateChanged2 ELSE CASE WHEN N=1 THEN Y2 ELSE H2.DateChanged2 END END D2
FROM H2
JOIN year_span N ON N.N <=Y
),
-- H: year of each piece's start date and the piece's length in days.
H AS (
SELECT PersonID, NextStatusID, EXTRACT(year FROM d1) Y, d2-d1 D
FROM H3
)
-- Total days per person, status description and year.
select PersonID, sdescr Status, Y, sum(d) d
from H
join statuses s on NextStatusID = s.sid
group by PersonID, sdescr, Y
order by PersonID, sdescr, Y
output
PersonID Status Y d
111 Active 2020 177
111 Other 2020 6
111 Out of the Office 2020 7
222 Out of the Office 2019 1
222 Out of the Office 2020 366
222 Out of the Office 2021 1
check the fiddle here

SQL query help to calculate max

I have a question regarding a query I need to do in SQL (i use BQ).
I have this table:
train_no, wagon_no, weight, length, date, startpoint(km), endpoint(km)
1, 123, 1000, 20, 20190101, 0, 7
1, 234, 2000, 20, 20190101, 1, 2
1, 345, 3000, 30, 20190101, 1, 5
1, 456, 1000, 40, 20190101, 1, 6
2, 987, 1000, 10, 20190101, 0, 8
2, 876, 2000, 20, 20190101, 1, 2
2, 765, 3000, 20, 20190101, 1, 5
2, 654, 1000, 20, 20190101, 1, 6
The table shows two trains with wagons. Per wagon we see at what point the wagon was added on the train. So for train no 1 we see that wagon 234 was included on the train from startpoint=1 (kilometer 1) to endpoint=2 (kilometer 2) then it was removed from the train. We also see that max endpoint is 7 for train_no =1 so max travelled distance for the train is 7 km.
The total train length and weight varies during the distance and I would like to calculate the maximum length and maximum weight reached during the distance. How can I do this in SQL?
Any suggestion is appreciated.
Edit:
Adding a picture to clarify what I'm looking for.
As you can see in attached pic Train_no =1 have a max weight between point 1 and 2. Total weight is 7000 and is the total of all wagons in the train at that specific distance. Also, total length is 110 which is the total length of the all wagons added together.
Below is for BigQuery Standard SQL
#standardSQL
-- Maximum wagon count, total weight and total length each train reaches at
-- any point of its route on a given date.
WITH train_extent AS (
  -- First and last kilometre covered by any wagon of the train.
  SELECT
    train_no,
    dt,
    MIN(startpoint) AS startpoint,
    MAX(endpoint) AS endpoint
  FROM `project.dataset.table`
  GROUP BY train_no, dt
),
load_per_point AS (
  -- Wagons on the train at each kilometre point; a wagon counts from its
  -- startpoint (inclusive) up to its endpoint (exclusive).
  SELECT
    e.train_no,
    e.dt,
    point,
    COUNT(t.wagon_no) AS wagons,
    SUM(t.weight) AS total_weight,
    SUM(t.len) AS total_len
  FROM train_extent AS e
  CROSS JOIN UNNEST(GENERATE_ARRAY(e.startpoint, e.endpoint)) AS point
  INNER JOIN `project.dataset.table` AS t
    ON t.train_no = e.train_no
    AND t.dt = e.dt
    AND point >= t.startpoint
    AND point < t.endpoint
  GROUP BY e.train_no, e.dt, point
)
SELECT
  train_no,
  dt,
  MAX(wagons) AS max_wagons,
  MAX(total_weight) AS max_total_weight,
  MAX(total_len) AS max_total_len
FROM load_per_point
GROUP BY train_no, dt
You can test it with the sample data from your question, as in the example below:
#standardSQL
-- Same maximum-load query, run against the question's sample rows supplied
-- inline under the name of the real table.
WITH `project.dataset.table` AS (
  SELECT 1 AS train_no, 123 AS wagon_no, 1000 AS weight, 20 AS len, '20190101' AS dt, 0 AS startpoint, 7 AS endpoint UNION ALL
  SELECT 1, 234, 2000, 20, '20190101', 1, 2 UNION ALL
  SELECT 1, 345, 3000, 30, '20190101', 1, 5 UNION ALL
  SELECT 1, 456, 1000, 40, '20190101', 1, 6 UNION ALL
  SELECT 2, 987, 1000, 10, '20190101', 0, 8 UNION ALL
  SELECT 2, 876, 2000, 20, '20190101', 1, 2 UNION ALL
  SELECT 2, 765, 3000, 20, '20190101', 1, 5 UNION ALL
  SELECT 2, 654, 1000, 20, '20190101', 1, 6
),
train_extent AS (
  -- First and last kilometre covered by any wagon of the train.
  SELECT
    train_no,
    dt,
    MIN(startpoint) AS startpoint,
    MAX(endpoint) AS endpoint
  FROM `project.dataset.table`
  GROUP BY train_no, dt
),
load_per_point AS (
  -- Wagons on the train at each kilometre point; a wagon counts from its
  -- startpoint (inclusive) up to its endpoint (exclusive).
  SELECT
    e.train_no,
    e.dt,
    point,
    COUNT(t.wagon_no) AS wagons,
    SUM(t.weight) AS total_weight,
    SUM(t.len) AS total_len
  FROM train_extent AS e
  CROSS JOIN UNNEST(GENERATE_ARRAY(e.startpoint, e.endpoint)) AS point
  INNER JOIN `project.dataset.table` AS t
    ON t.train_no = e.train_no
    AND t.dt = e.dt
    AND point >= t.startpoint
    AND point < t.endpoint
  GROUP BY e.train_no, e.dt, point
)
SELECT
  train_no,
  dt,
  MAX(wagons) AS max_wagons,
  MAX(total_weight) AS max_total_weight,
  MAX(total_len) AS max_total_len
FROM load_per_point
GROUP BY train_no, dt
result is
Row train_no dt max_wagons max_total_weight max_total_len
1 1 20190101 4 7000 110
2 2 20190101 4 7000 70
The following query returns the train length at each km marker, sorted in descending order of length for each train.
-- Total train length at every km marker, longest first within each train.
with data as (
  select 1 as train_no, 123 as wagon_no, 1000 as weight, 20 as length, 20190101 as date, 0 as startpoint, 7 as endpoint union all
  select 1, 234, 2000, 20, 20190101, 1, 2 union all
  select 1, 345, 3000, 30, 20190101, 1, 5 union all
  select 1, 456, 1000, 40, 20190101, 1, 6 union all
  select 2, 987, 1000, 10, 20190101, 0, 8 union all
  select 2, 876, 2000, 20, 20190101, 1, 2 union all
  select 2, 765, 3000, 20, 20190101, 1, 5 union all
  select 2, 654, 1000, 20, 20190101, 1, 6
),
-- Every km marker covered by the data. The bounds are taken from the rows
-- themselves instead of a fixed 0..10 range, so routes longer than 10 km are
-- no longer cut off.
km_array as (
  select km
  from unnest(generate_array(
    (select min(startpoint) from data),
    (select max(endpoint) from data)
  )) as km
),
-- Wagons present at each marker. BETWEEN is inclusive at both ends, so a wagon
-- is still counted at its own endpoint marker.
joined as (
  select
    km_array.km,
    data.train_no,
    data.length
  from km_array
  cross join data
  where km_array.km between data.startpoint and data.endpoint
),
train_length_at_each_km as (
  select
    km,
    train_no,
    sum(length) as length
  from joined
  group by km, train_no
)
select
  train_no, length, km
from train_length_at_each_km
order by train_no, length desc
Getting maximum weight would use similar logic as the train_length_at_each_km CTE.

How to convert this MYSQL SQL to HIVE SQL?

The table ProductOrder columns include:
id shopid starttime endtime
1 123 2018-04-27 2018-04-28
2 234 2018-04-23 2018-04-30
3 189 2018-05-01 2018-05-30
4 321 2018-05-01 2018-05-29
I want to count the valid shops for each day of the latest month. A shop is valid on a day when starttime <= $curDate <= endtime, where curDate is a variable that takes the value of each day of the latest month.
Today is 2018-04-27,so the query result should be:
day count
2018-04-27 2
2018-04-26 1
2018-04-25 1
2018-04-24 1
2018-04-23 1
2018-04-22 0
2018-04-21 0
……………………………………
2018-03-26 0
I achieve this requirement in MYSQL.This SQL can work well in MYSQL.How can I convert to Hive Sql?
-- MySQL: for each of the last 30 days (today = 0 days ago), count the distinct
-- shops that have an order with status = 2 whose start/end range covers the day.
SELECT
    DATE_SUB(DATE(NOW()), INTERVAL d.days DAY) AS day,
    COUNT(DISTINCT shopID) AS count
FROM (
    -- Offsets 0..29, one row per day to report.
    SELECT 0 AS days
    UNION SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5
    UNION SELECT 6 UNION SELECT 7 UNION SELECT 8 UNION SELECT 9 UNION SELECT 10
    UNION SELECT 11 UNION SELECT 12 UNION SELECT 13 UNION SELECT 14 UNION SELECT 15
    UNION SELECT 16 UNION SELECT 17 UNION SELECT 18 UNION SELECT 19 UNION SELECT 20
    UNION SELECT 21 UNION SELECT 22 UNION SELECT 23 UNION SELECT 24 UNION SELECT 25
    UNION SELECT 26 UNION SELECT 27 UNION SELECT 28 UNION SELECT 29
) AS d
LEFT JOIN ProductOrder AS po
    -- All filters stay in ON so that days without a valid shop keep a row.
    ON po.starttime <= DATE_SUB(DATE(NOW()), INTERVAL d.days DAY)
    AND po.endtime >= DATE_SUB(DATE(NOW()), INTERVAL d.days DAY)
    AND status = 2
GROUP BY d.days;
Hive does not support non-equi join conditions in the ON clause, so the date-range test has to be moved out of the join (for example, into the WHERE clause). Use STACK instead of many UNION subqueries.
-- Hive: distinct shops with a valid (status = 2) order on each of the last 30
-- days, including days on which no shop is valid (reported as 0).
SELECT DATE_SUB(CURRENT_DATE, days_ago.days) AS day,
       -- The date-range test lives inside the aggregate rather than in WHERE.
       -- A WHERE filter on the outer-joined table would discard every row of a
       -- day that no order covers, and that day would vanish from the result.
       COUNT(DISTINCT CASE
                 WHEN DATE_SUB(CURRENT_DATE, days_ago.days) <= po.endtime
                  AND DATE_SUB(CURRENT_DATE, days_ago.days) >= po.starttime
                 THEN po.shopID
             END) AS count
FROM
(
    SELECT stack(30, -- the number of elements
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29) AS (days)
) days_ago
-- LEFT JOIN keeps all 30 days even when no order has status = 2.
LEFT JOIN ProductOrder po ON po.status = 2
GROUP BY DATE_SUB(CURRENT_DATE, days_ago.days);
-- Hive: distinct shops with a valid (status = 2) order on each of the last 30
-- days, with the day offsets produced by exploding an array literal.
SELECT
    DATE_SUB(CURRENT_DATE, d.days),
    COUNT(DISTINCT po.shopID) AS count
FROM (
    -- Offsets 0..29, one row per day to report.
    SELECT explode(array(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29
    )) AS days
) d
LEFT JOIN ProductOrder po
    -- All filters stay in ON so that days without a valid shop keep a row.
    ON po.starttime <= DATE_SUB(CURRENT_DATE, d.days)
    AND po.endtime >= DATE_SUB(CURRENT_DATE, d.days)
    AND po.status = 2
GROUP BY d.days;