Hive windowing query - hive

I have a base Hive table with following schema:
And I want the below output:
So basically, grouping on all columns, and calculating the count distinct Encounters in that month and last 3 months (including that month).
For example, for DischargeMonthYear Jan-2018, num_discharges_last_30_days would be patients discharged in Jan-2018 (3) and num_discharges_last_90_days would be patients discharged in Nov-17, Dec-17 and Jan-18. Since there is no data before Jan-18 in this case, both counts would be the same.
Similarly for Mar-18, num_discharges_last_90_days should include counts for Jan, Feb and Mar-18 months (3+2+2 = 7).
For Jun-18, since we have no data for Apr and May-18, it should include counts only for Jun-18 and NOT go back to the previous group/partition.
I have the below query that gives me the correct total for num_discharges_last_90_days till Jun-18 but does not follow the grouping of earlier columns and for Jul-18 it also includes Jun-18 totals which should not be the case since the region is different.
If I add a PARTITION BY region (and others) clause for it, num_discharges_last_90_days is correct for Jul-18 now, but incorrect for Jun-18 since it includes the Feb and Mar-18 totals.
`
-- Sample fixture for the question: one row per patient encounter.
DROP TABLE IF EXISTS Encounter;
CREATE TEMPORARY TABLE Encounter
(
Encounter_no int,
Admit_date date,
discharge_date date, -- the monthly and rolling counts in the queries below are bucketed by this date
region varchar(50),
Facilityname varchar(50),
Payertype varchar(10),
Payernamme varchar(20), -- spelling is part of the schema; the queries below reference it as written
patient_type varchar(10)
);
-- Discharges per month in this fixture: Jan-2018 = 3, Feb-2018 = 2, Mar-2018 = 2,
-- Jun-2018 = 2 (all Midwest) and Jul-2018 = 1 (NorthEast). There are no rows for Apr/May-2018.
INSERT INTO Encounter
select 12345, '2018-01-01', '2018-01-05', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12346, '2018-01-02', '2018-01-06', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12347, '2018-01-03', '2018-01-07', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12348, '2018-02-04', '2018-02-08', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12349, '2018-02-05', '2018-02-09', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12350, '2018-03-06', '2018-03-10', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12351, '2018-03-07', '2018-03-11', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12352, '2018-06-08', '2018-06-12', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12353, '2018-06-09', '2018-06-13', 'Midwest', 'ABC', 'MCR', 'MCR123', 'IP' union all
select 12354, '2018-07-10', '2018-07-14', 'NorthEast', 'ABC', 'MCR', 'MCR123', 'IP'
;
-- Asker's attempt: per-group monthly discharge counts plus a rolling sum over the
-- current row and the two rows before it.
-- NOTE(review): the window further down has no PARTITION BY and uses a row-based frame
-- (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), so it sums over the two preceding result
-- rows regardless of their grouping columns.
--SELECT from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'),'MM') AS `Discharge_Month` FROM Encounter e
--Below CTE is used to get all month numbers
WITH R AS
(
SELECT '01' AS MonthNum
UNION ALL SELECT '02'
UNION ALL SELECT '03'
UNION ALL SELECT '04'
UNION ALL SELECT '05'
UNION ALL SELECT '06'
UNION ALL SELECT '07'
UNION ALL SELECT '08'
UNION ALL SELECT '09'
UNION ALL SELECT '10'
UNION ALL SELECT '11'
UNION ALL SELECT '12'
)
SELECT * FROM
(
--Perform a left join on CTE with your query to get all months
SELECT
R.MonthNum,
e.region,
e.facilityname,
from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'),'MMM-yyyy') AS Discharge_Month,
e.Payertype,
e.Payernamme,
e.patient_type,
-- Months without any encounter arrive from the LEFT JOIN with a NULL region and count as 0.
CASE WHEN COALESCE(e.region, '') <> ''
THEN COUNT(1)
ELSE 0
END
as num_discharges_last_30_days,
-- Rolling total: the same per-row count summed over the current and two preceding rows.
SUM(
CASE WHEN COALESCE(e.region, '') <> ''
THEN COUNT(1)
ELSE 0
END
)
OVER (ORDER BY R.MonthNum
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as num_discharges_last_90_days
FROM R
LEFT JOIN Encounter e
-- The match is on the two-digit month only ('MM'); the year is not part of the join key.
ON R.MonthNum = from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'),'MM')
GROUP BY
R.MonthNum,
e.region,
e.facilityname,
from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'),'MMM-yyyy'),
e.Payertype,
e.Payernamme,
e.patient_type
) A
-- Drop the placeholder rows for months that had no encounters.
WHERE A.region IS NOT NULL
;
`

My colleague cracked the question using the below query. It needed a self-join and CASE & WHERE clauses to only count the last 3 months calculation.
-- Monthly discharge count (measure_1) and rolling three-month count (measure_2)
-- for every combination of region, facility, payer type, payer name and patient type.
WITH CTE AS (
    -- One row per grouping and month; LAST_DAY() maps each discharge date to its month-end.
    SELECT
        a.region,
        a.facilityname,
        a.payertype,
        a.payernamme,
        a.patient_type,
        LAST_DAY(a.discharge_date) AS month_year,
        COUNT(encounter_no) AS measure_1
    FROM Encounter AS a
    GROUP BY
        a.region,
        a.facilityname,
        a.payertype,
        a.payernamme,
        a.patient_type,
        LAST_DAY(a.discharge_date)
)
-- SELECT * FROM CTE AS a;
SELECT
    a.region,
    a.facilityname,
    a.payertype,
    a.payernamme,
    a.patient_type,
    a.month_year,
    MAX(a.measure_1) AS measure_1,
    -- Add up the months paired with this row; fall back to the row's own count when
    -- the self-join supplied no month.
    SUM(
        CASE
            WHEN b.month_year IS NULL THEN a.measure_1
            ELSE b.measure_1
        END
    ) AS measure_2
FROM CTE AS a
LEFT JOIN CTE AS b
    ON  a.region = b.region
    AND a.facilityname = b.facilityname
    AND a.payertype = b.payertype
    AND a.payernamme = b.payernamme
    AND a.patient_type = b.patient_type
-- Keep only partner months inside the window [month - 2, month] of the same grouping.
WHERE (
    b.month_year IS NULL
    OR b.month_year BETWEEN add_months(a.month_year, -2) AND a.month_year
)
GROUP BY
    a.region,
    a.facilityname,
    a.payertype,
    a.payernamme,
    a.patient_type,
    a.month_year;

Related

Using Pivot in Oracle SQL to dynamically show one or two columns in case multiple records are present

I am working in Oracle Fusion HCM and would like to create a query which pulls an employee's base data such as name, location, etc. We also want to include the managers.
Our manager structure is as such so that there's 1 line manager and 1 to n (realistically not more than 3) matrix managers, named 'REVIEWER'.
I have a working code that fetches the data, but it gives issues when there's not exactly 2 managers. When there's 1, it shows the same name twice and if there's 3, there is one that is not shown.
Can anyone help me out on how to fetch the correct manager names without using the MIN/MAX aggregates? My query is already fetching the correct data, but my pivot clause is not working correctly.
-- Asker's original query: joins assignment, supervisor, person-name, organization,
-- location, grade and contract rows, then pivots the manager names by manager type.
Select DISTINCT *
from
(
SELECT DISTINCT
emplName.DISPLAY_NAME Worker_Name,
INITCAP(loc.LOCATION_NAME) Location_Name,
gra.NAME Grade_Name,
hou.NAME Department_Name,
ass.MANAGER_TYPE Manager_Type,
mgr.DISPLAY_NAME Manager_Name,
REPLACE(ctr.CONTRACT_END_DATE,'4712-12-31') Contract_End_Date,
aa.ASSIGNMENT_NUMBER
FROM
PER_ALL_ASSIGNMENTS_M aa,
PER_ASSIGNMENT_SUPERVISORS_F ass,
PER_PERSON_NAMES_F emplName,
PER_ALL_PEOPLE_F empl,
PER_PERSON_NAMES_F mgr,
HR_ORGANIZATION_UNITS hou,
HR_LOCATIONS_ALL_F_VL loc,
PER_GRADES_F_TL gra,
PER_CONTRACTS_F ctr
WHERE
aa.ASSIGNMENT_ID (+) = ass.ASSIGNMENT_ID
AND emplName.PERSON_ID = ass.PERSON_ID
AND ass.MANAGER_ID = mgr.PERSON_ID
AND empl.PERSON_ID = ass.PERSON_ID
AND hou.ORGANIZATION_ID = aa.ORGANIZATION_ID
AND loc.LOCATION_ID = aa.LOCATION_ID
AND gra.GRADE_ID = aa.GRADE_ID
AND ctr.CONTRACT_ID = aa.CONTRACT_ID
AND aa.ASSIGNMENT_STATUS_TYPE = 'ACTIVE'
-- Keep only rows whose effective end date is 31/12/4712
-- (presumably the open-ended, currently effective record; verify against the data model).
AND to_char(ass.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
AND to_char(aa.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
AND to_char(ctr.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
AND gra.SOURCE_LANG = 'US'
-- Report parameters
AND gra.NAME in (:p_grade)
AND hou.NAME in (:p_department)
AND INITCAP(loc.LOCATION_NAME) in (:p_location)
AND (ctr.CONTRACT_END_DATE <= (:p_contractenddate)
OR (:p_contractenddate) is null)
) S
-- Each manager type produces two columns: the MAX and the MIN manager name of that type.
-- NOTE(review): with a single manager MAX and MIN are the same name; with three managers
-- the alphabetically middle name is returned by neither aggregate.
Pivot
(
MAX(Manager_Name) Manager1,
MIN(Manager_Name) Manager2
for manager_type in
('LINE_MANAGER' as Line_Manager,
'REVIEWER' as Reviewer
))
Piv
The data regarding managers is recorded in PER_ASSIGNMENT_SUPERVISORS_F ass as follows:
ASSIGNMENT_ID
MANAGER_TYPE
MANAGER_ID
0129312
LINE_MANAGER
2343943
0129312
REVIEWER
456756
0129312
REVIEWER
456334
0129312
REVIEWER
234324
1232232
LINE_MANAGER
232242
1232232
REVIEWER
122312
Edit: Table formatting was broken
Use:
-- Pivot each assignment's managers into fixed columns: one line manager and up to
-- three reviewers. Managers are numbered within (assignment, person, manager type)
-- and the (type, number) pair selects the output column.
Select *
from (
SELECT ass.assignment_id,
ass.person_id,
ass.MANAGER_TYPE Manager_Type,
mgr.DISPLAY_NAME Manager_Name,
ROW_NUMBER() OVER (
PARTITION BY ass.assignment_id, ass.person_id, ass.manager_type
-- manager_id breaks ties between managers that share a display name, so the
-- reviewer-to-column assignment is the same on every run
ORDER BY mgr.display_name, ass.manager_id
) AS rn
FROM PER_ASSIGNMENT_SUPERVISORS_F ass
INNER JOIN PER_PERSON_NAMES_F mgr
ON (ass.MANAGER_ID = mgr.PERSON_ID)
-- keep only supervisor rows whose effective end date is 31-Dec-4712
WHERE ass.EFFECTIVE_END_DATE = DATE '4712-12-31'
)
PIVOT (
-- exactly one name exists per (type, rn) cell, so MAX simply returns it
MAX(Manager_Name)
for (manager_type, rn) in (
('LINE_MANAGER', 1) as Line_Manager,
('REVIEWER', 1) as Reviewer1,
('REVIEWER', 2) as Reviewer2,
('REVIEWER', 3) as Reviewer3
)
)
Then join the rest of the tables to that pivoted query (rather than trying to join first and then pivot).
Which, for the (minimal) sample data:
-- Minimal fixture: assignments 1 to 4 each have one line manager and
-- 3, 2, 1 and 0 reviewers respectively.
CREATE TABLE PER_ASSIGNMENT_SUPERVISORS_F (assignment_id, person_id, manager_id, manager_type, effective_end_date) AS
SELECT 1, 1, 2, 'LINE_MANAGER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 1, 1, 3, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 1, 1, 4, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 1, 1, 5, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 2, 2, 3, 'LINE_MANAGER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 2, 2, 4, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 2, 2, 5, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 3, 3, 4, 'LINE_MANAGER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 3, 3, 5, 'REVIEWER', DATE '4712-12-31' FROM DUAL UNION ALL
SELECT 4, 4, 5, 'LINE_MANAGER', DATE '4712-12-31' FROM DUAL;
-- Person names looked up through manager_id (person_id 1 to 5 map to Alice .. Emily).
CREATE TABLE PER_PERSON_NAMES_F (person_id, display_name) AS
SELECT 1, 'Alice' FROM DUAL UNION ALL
SELECT 2, 'Beryl' FROM DUAL UNION ALL
SELECT 3, 'Carol' FROM DUAL UNION ALL
SELECT 4, 'Debra' FROM DUAL UNION ALL
SELECT 5, 'Emily' FROM DUAL;
Outputs:
ASSIGNMENT_ID
PERSON_ID
LINE_MANAGER
REVIEWER1
REVIEWER2
REVIEWER3
1
1
Beryl
Carol
Debra
Emily
2
2
Carol
Debra
Emily
null
3
3
Debra
Emily
null
null
4
4
Emily
null
null
null
fiddle
Rewrote the query based on MT0s answer. For anyone interested in the end-result:
-- Final report query: one row per worker assignment with the line manager and up to
-- three reviewers pivoted into separate columns. Managers are numbered per assignment
-- and manager type, and the (manager_type, rn) pair picks the output column.
Select *
from
(
SELECT
emplName.DISPLAY_NAME Worker_Name,
INITCAP(loc.LOCATION_NAME) Location_Name,
gra.NAME Grade_Name,
hou.NAME Department_Name,
ass.MANAGER_TYPE Manager_Type,
mgr.DISPLAY_NAME Manager_Name,
ROW_NUMBER() OVER (
PARTITION BY aa.ASSIGNMENT_NUMBER, ass.assignment_id, ass.person_id, gra.NAME, hou.NAME, ass.manager_type
-- MANAGER_ID breaks ties between managers that share a display name, so the
-- reviewer-to-column assignment is the same on every run
ORDER BY mgr.display_name, ass.MANAGER_ID
) AS rn,
REPLACE(ctr.CONTRACT_END_DATE,'4712-12-31') Contract_End_Date,
aa.ASSIGNMENT_NUMBER
FROM
PER_ALL_ASSIGNMENTS_F aa
-- Every join below keeps only the row whose end date is 31/12/4712.
LEFT JOIN PER_ASSIGNMENT_SUPERVISORS_F ass
ON (aa.ASSIGNMENT_ID = ass.ASSIGNMENT_ID
AND to_char(ass.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712')
LEFT JOIN PER_PERSON_NAMES_F_V emplName
ON (ass.PERSON_ID = emplName.PERSON_ID
AND to_char(emplName.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
AND emplName.NAME_TYPE = 'GLOBAL')
LEFT JOIN PER_ALL_PEOPLE_F empl
ON (empl.PERSON_ID = ass.PERSON_ID
AND to_char(empl.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712')
LEFT JOIN PER_PERSON_NAMES_F mgr
ON (mgr.PERSON_ID = ass.MANAGER_ID
AND to_char(mgr.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
AND mgr.NAME_TYPE = 'GLOBAL')
LEFT JOIN HR_ORGANIZATION_UNITS hou
ON (hou.ORGANIZATION_ID = aa.ORGANIZATION_ID
AND to_char(hou.DATE_TO, 'DD/MM/YYYY') = '31/12/4712')
LEFT JOIN HR_LOCATIONS_ALL_F_VL loc
ON (loc.LOCATION_ID = aa.LOCATION_ID
AND to_char(loc.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712')
LEFT JOIN PER_GRADES_F_TL gra
ON (gra.GRADE_ID = aa.GRADE_ID
AND gra.LANGUAGE = 'US'
AND to_char(gra.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712')
LEFT JOIN PER_CONTRACTS_F ctr
ON (ctr.CONTRACT_ID = aa.CONTRACT_ID
AND to_char(ctr.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712')
WHERE 1=1
AND aa.ASSIGNMENT_STATUS_TYPE = 'ACTIVE'
AND to_char(aa.EFFECTIVE_END_DATE, 'DD/MM/YYYY') = '31/12/4712'
-- PARAMETERS
-- NOTE(review): the predicates on gra, hou and loc reject the NULL rows produced by the
-- LEFT JOINs, so assignments without a grade, department or location are filtered out.
AND gra.NAME in (:p_grade)
AND hou.NAME in (:p_department)
AND INITCAP(loc.LOCATION_NAME) in (:p_location)
AND (ctr.CONTRACT_END_DATE <= (:p_contractenddate)
OR (:p_contractenddate) is null)
) S
Pivot
(
-- exactly one name exists per (manager_type, rn) cell, so MAX simply returns it
MAX(Manager_Name)
for (manager_type, rn) in (
('LINE_MANAGER', 1) as Line_Manager,
('REVIEWER', 1) as Reviewer1,
('REVIEWER', 2) as Reviewer2,
('REVIEWER', 3) as Reviewer3
))
Piv

SQL query to Ignore matching positive and negative values in a table

I have a transaction table that stores amounts paid (positive amounts) and corrections (negative amounts). I am looking for a query that ignores each matching pair of positive and negative amounts on the same date and returns the sum and the count of the remaining transactions.
Id Dept Date Amount
1 A 21-Apr-21 1100
1 A 21-Apr-21 1100
1 A 21-Apr-21 -1100
1 A 07-Apr-21 1100
1 A 03-Feb-21 100
1 A 12-Jan-21 500
The sql query should ignore Rows 2 and 3 as the amount was corrected and should not be counted as a transaction.
o/p should be
Id Dept sum(Amount) count(transaction)
1 A 2800 4
If I got you well, you can use below solution for that purpose.
I first ranked all the occurrences of the same amount value, before I grouped them in order to make oracle ignore all matching positive and negative values.
-- Net out matching positive/negative amounts per Id, Dept and date, then report the
-- sum and the number of the transactions that remain.
with YourSample (Id, Dept, Date#, Amount) as (
select 1, 'A', to_date('21-Apr-21', 'dd-Mon-RR', 'nls_date_language=english'), 1100 from dual union all
select 1, 'A', to_date('21-Apr-21', 'dd-Mon-RR', 'nls_date_language=english'), 1100 from dual union all
select 1, 'A', to_date('21-Apr-21', 'dd-Mon-RR', 'nls_date_language=english'), -1100 from dual union all
select 1, 'A', to_date('07-Apr-21', 'dd-Mon-RR', 'nls_date_language=english'), 1100 from dual union all
select 1, 'A', to_date('03-Feb-21', 'dd-Mon-RR', 'nls_date_language=english'), 100 from dual union all
select 1, 'A', to_date('12-Jan-21', 'dd-Mon-RR', 'nls_date_language=english'), 500 from dual
)
, ranked_rws as (
-- Number the occurrences of each signed amount within one Id, Dept and date.
-- The n-th +X and the n-th -X of the same day then share (abs amount, rn) and can be paired.
-- Date# is part of the partition so a correction only cancels a payment made on the same date.
select Id, Dept, Date#
, abs(Amount) Amount
, sign(Amount) row_sign
, row_number() OVER (PARTITION BY Id, Dept, Date#, Amount order by rownum) rn
from YourSample t
)
, unmatched_values as (
-- Each group holds at most one positive and one negative row; a sign sum of 0 means
-- the pair cancelled out, otherwise the surviving row keeps its original sign.
select ID, DEPT, sum(row_sign) * AMOUNT AMOUNT
from ranked_rws
group by ID, DEPT, DATE#, AMOUNT, RN
having sum(row_sign) != 0
)
select ID, DEPT, sum(AMOUNT) sum, count(*) transactions
from unmatched_values
group by ID, DEPT
;
demo
Maybe some idea like this could work.
-- Step 1: count how many times each (Id, Dept, Date, Amount) combination occurs.
-- "table" is the placeholder name used in the answer; substitute the real table name.
SELECT Id, Dept, Date, Amount, COUNT(*) AS RecordCount
INTO #temptable
FROM table
GROUP BY Id, Dept, Date, Amount;

-- Step 2: for every positive amount, subtract the number of equal negative amounts
-- recorded on the same Id, Dept and Date.
SELECT
    t1.Id,
    t1.Dept,
    t1.Date,
    (t1.RecordCount - COALESCE(t2.RecordCount, 0)) * t1.Amount AS NetAmount,
    t1.RecordCount - COALESCE(t2.RecordCount, 0) AS NetCount
FROM #temptable t1
LEFT JOIN #temptable t2
    ON  t1.Id = t2.Id
    AND t1.Dept = t2.Dept
    AND t1.Date = t2.Date
    AND (t1.Amount * -1) = t2.Amount
-- Drive the result from the positive rows only. Without this filter each negative row
-- is also emitted as t1 (with a negative count), which double counts the cancelled pair.
-- NOTE(review): corrections without a matching payment on the same date are not reported.
WHERE t1.Amount > 0;

How to select next value

The task is to show the payment dates of a loan. If a payment falls on a day of the month that does not exist in a given month, the query shows no data for that month, but it should show the first date of the next month instead.
The SQL works well if I enter the date 15.01.2019.
But if I enter the date 31.01.2019, I have a problem:
the query does not return the correct result.
-- Asker's query: builds a day-by-day calendar from per-month strings, maps every day to a
-- payable date, and splits a loan amount into TCount monthly instalments.
-- days: the numbers 1..31, used to index into each month string.
With days as (
Select rownum As Day from All_Objects where Rownum<=31
),
-- a: one string per month of 2019 (Jan..Jul); character N is the type of day N.
-- Presumably 'W' marks a working day and 'H' a holiday/weekend -- confirm with the author.
a as (Select 'WHWWHHWWWWWHHWWWWWHHWWWWWHHWWWW' as hl ,1 as Mnth,2019 as Yr from Dual
Union All
Select 'WHHWWWWWHHWWWWWHHWWWWWHHWWWW' as hl ,2 as Mnth,2019 as Yr from Dual
Union All
Select 'WHHWWWWHHHWWWWWHHWWHHHHHHHWWWHH' as hl ,3 as Mnth,2019 as Yr from Dual
Union All
Select 'WWWWWHHWWWWWHHWWWWWHHWWWWWHHWW' as hl ,4 as Mnth,2019 as Yr from Dual
Union All
Select 'WWWHHWWWHWHHWWWWWHHWWWWWHHWHWWW' as hl ,5 as Mnth,2019 as Yr from Dual
Union All
Select 'HHWWHHWHHWWWWWHHHWWWWHHWWHWWHH' as hl ,6 as Mnth,2019 as Yr from Dual
Union All
Select 'WWWWWHHWWWWWHHWWWWWHHWWWWWHHWWW' as hl ,7 as Mnth,2019 as Yr from Dual
)
,
-- Alll: one row per calendar day (only days up to the string length exist) with its day type.
Alll as
(Select TO_Date(Yr|| substr('0'||Mnth,-2,2)||substr('0'||Day,-2,2),'YYYYMMDD') as Dt,a.Yr,a.Mnth,Days.Day,substr(a.Hl,Days.Day,1) as Daytype from Days,a Where Days.Day<=Length(a.Hl)
),
-- Taksit: the loan being scheduled (start date, total amount, number of instalments).
-- NOTE(review): To_Date is called without a format mask, so parsing '31.01.2019'
-- depends on the session's NLS_DATE_FORMAT.
Taksit as
(
Select To_Date('31.01.2019') as TDate, 1000 as Amount ,3 as Tcount from Dual
),
-- PD: for every day, the payable date -- the day itself unless it is an 'H' day,
-- in which case the earliest 'W' day on or after it.
PD as (
Select
A.Dt,A.DayType , Case when A.DayType='H' then Min(W.Dt) else A.Dt end As PayableDate
From Alll A inner Join Alll W on W.DT>=A.DT and W.DayType='W'
Group by A.Dt, A.Daytype
Order by 1
),
-- PreResult: days between one month and TCount months after TDate whose day-of-month
-- equals TDate's day-of-month; MPD is the latest payable date among them.
PreResult as
(
Select PD.PayableDate,Amount,TCount,Max(PD.PayableDate) over (Partition by 'Contract') as MPD
From PD inner join Taksit T on PD.DT between add_months(T.TDate,1) and Add_Months(T.TDate,TCount)
-- NOTE(review): months that lack this day number (e.g. no 31st) produce no matching row here.
and TO_Char(PD.DT,'DD')=TO_Char(T.TDate,'DD')
)
-- The last instalment absorbs the rounding remainder; all others get Round(Amount/TCount,2).
Select
PayableDate, Case when PayableDate=MPD then Amount-(Round(Amount/TCount,2)*(TCount-1)) else Round(Amount/TCount,2) end PayAmount
from PreResult
You have used TO_CHAR(PD.DT, 'DD') = TO_CHAR(T.TDATE, 'DD') but I don't think that Feb month has any date which will match with it.
Ideally, you should use add_month function as following in PRERESULT (I believe you need only 3 months data)
PRERESULT AS (
-- One row per instalment: the calendar day that is exactly k months after the start
-- date (k = 1 .. TCOUNT), together with its payable date. MPD is the latest payable
-- date and identifies the final instalment.
SELECT
PD.PAYABLEDATE,
AMOUNT,
TCOUNT,
MAX(PD.PAYABLEDATE) OVER(
PARTITION BY 'Contract'
) AS MPD
FROM
PD
INNER JOIN TAKSIT T ON PD.DT BETWEEN ADD_MONTHS(T.TDATE, 1) AND ADD_MONTHS(T.TDATE, T.TCOUNT)
-- MONTHS_BETWEEN of the two month starts is the whole number of months k between the
-- start month and this day's month. The day qualifies only if it equals
-- ADD_MONTHS(TDATE, k). ADD_MONTHS clamps to the last day of shorter months, so a
-- start date of the 31st still yields one date per month, and any TCOUNT is supported
-- instead of a hard-coded list of 1, 2 and 3 months.
AND PD.DT = ADD_MONTHS(T.TDATE, MONTHS_BETWEEN(TRUNC(PD.DT, 'MM'), TRUNC(T.TDATE, 'MM')))
)
It is giving 3 dates with 31.01.2019 and also it is working as expected in the case of 15.01.2019 also.
I think you should check if it is giving an expected result with 31.01.2019 as you have not mentioned the expected result. see this db<>fiddle demo
Cheers!!

SQL Query That Find An Average After Different Dates For Each Row

I am trying to find an average score both before and after a given date, where each user has their own date I would like to use.
I have 2 tables, the first includes the agent name, score, and date:
Name Score Date
---- ----- ----
Dan 81 10/1/2016
Brad 35 8/5/2016
Allison 92 6/3/2016
Cindy 95 8/12/2016
Dan 45 7/16/2016
Cindy 77 4/16/2016
Allison 59 3/22/2016
Brad 55 3/22/2016
The 2nd table includes the agent name and the date they received training
Agent_name Training_date
---------- ----------
Dan 8/28/2016
Brad 4/15/2016
Cindy 3/3/2016
Allison 5/1/2016
What I am looking for is an output that includes the name, training date, average before training, and average after training. Ideally will look something like this
Agent_name Training_date Avg_pre_training Avg_post_training
---------- ------------- ---------------- -----------------
Dan 8/28/2016 45 81
Brad 4/15/2016 55 35
Cindy 3/3/2016 0 86
Allison 5/1/2016 59 92
I just can't seem to get a query that recognizes each person has their own date I need to take into account.
Below is for BigQuery Standard SQL
#standardSQL
-- Average score per agent and training date, split into scores dated on/before the
-- training and scores dated after it. Both date columns are stored as M/D/YYYY strings
-- and are parsed to DATE in the inner query before being compared.
SELECT
  Agent_name,
  Training_date,
  ROUND(AVG(IF(date <= Training_date, Score, NULL))) AS Avg_pre_training,
  ROUND(AVG(IF(date > Training_date, Score, NULL))) AS Avg_post_training
FROM (
  SELECT
    Agent_name,
    Score,
    PARSE_DATE('%m/%d/%Y', date) AS date,
    PARSE_DATE('%m/%d/%Y', Training_date) AS Training_date
  FROM training
  INNER JOIN agents
    ON Name = Agent_name
)
GROUP BY
  Agent_name,
  Training_date
-- ORDER BY Agent_name, Training_date
You can play with this query using dummy data from your example in question
#standardSQL
-- Self-contained demo of the same query, with the question's rows inlined as CTEs.
WITH agents AS (
  -- Scores per agent; dates are M/D/YYYY strings.
  SELECT 'Dan' AS Name, 81 AS Score, '10/1/2016' AS date
  UNION ALL SELECT 'Brad', 35, '8/5/2016'
  UNION ALL SELECT 'Allison', 92, '6/3/2016'
  UNION ALL SELECT 'Cindy', 95, '8/12/2016'
  UNION ALL SELECT 'Dan', 45, '7/16/2016'
  UNION ALL SELECT 'Cindy', 77, '4/16/2016'
  UNION ALL SELECT 'Allison', 59, '3/22/2016'
  UNION ALL SELECT 'Brad', 55, '3/22/2016'
  UNION ALL SELECT 'Allison', 70, '6/25/2016'
),
training AS (
  -- Training dates per agent; Allison has two trainings.
  SELECT 'Dan' AS Agent_name, '8/28/2016' AS Training_date
  UNION ALL SELECT 'Brad', '4/15/2016'
  UNION ALL SELECT 'Cindy', '3/3/2016'
  UNION ALL SELECT 'Allison', '5/1/2016'
  UNION ALL SELECT 'Allison', '6/28/2016'
)
-- One output row per (agent, training date): average of the scores dated on/before
-- that training and average of the scores dated after it.
SELECT
  Agent_name,
  Training_date,
  ROUND(AVG(IF(date <= Training_date, Score, NULL))) AS Avg_pre_training,
  ROUND(AVG(IF(date > Training_date, Score, NULL))) AS Avg_post_training
FROM (
  SELECT
    Agent_name,
    Score,
    PARSE_DATE('%m/%d/%Y', date) AS date,
    PARSE_DATE('%m/%d/%Y', Training_date) AS Training_date
  FROM training
  INNER JOIN agents
    ON Name = Agent_name
)
GROUP BY
  Agent_name,
  Training_date
-- ORDER BY Agent_name, Training_date
Note: I added few rows to make example more generic to address case of multiple trainings for the same user
Please see my answer below, using the where statement I control for pre and post training and then join two tables back together to get the result set.
-- Average score per agent before and after that agent's training date (T-SQL).
-- #SET1 holds the scores, #TRAININGDATE the training date per agent.
CREATE TABLE #SET1
(
    NAME VARCHAR(20),
    SCORE INT,
    [DATE] DATE
);

CREATE TABLE #TRAININGDATE
(
    NAME VARCHAR(20),
    TRAINING_DATE DATE
);

INSERT INTO #SET1
    (NAME, SCORE, [DATE])
VALUES
    ('Dan',81,'10/1/2016'),
    ('Brad',35,'8/5/2016'),
    ('Allison',92,'6/3/2016'),
    ('Cindy',95,'8/12/2016'),
    ('Dan',45,'7/16/2016'),
    ('Cindy',77,'4/16/2016'),
    ('Allison',59,'3/22/2016'),
    ('Brad',55,'3/22/2016');

-- NOTE(review): names are stored upper-case here but mixed-case in #SET1, so the joins
-- below only match under a case-insensitive collation.
INSERT INTO #TRAININGDATE
    (NAME, TRAINING_DATE)
VALUES
    ('DAN','8/28/2016'),
    ('BRAD','4/15/2016'),
    ('CINDY','3/3/2016'),
    ('ALLISON','5/1/2016');

-- Average of the scores dated strictly before the training date.
-- INNER JOIN states what the WHERE clause already enforced on the original LEFT JOIN.
SELECT AVG(A.SCORE) AS AVERAGE_SCORE_BEFORE, A.NAME
INTO #TEMP_A
FROM #SET1 AS A
INNER JOIN #TRAININGDATE AS B
    ON A.NAME = B.NAME
WHERE A.[DATE] < B.TRAINING_DATE
GROUP BY A.NAME;

-- Average of the scores dated strictly after the training date.
SELECT AVG(A.SCORE) AS AVERAGE_SCORE_AFTER_TRAINING, A.NAME
INTO #TEMP_B
FROM #SET1 AS A
INNER JOIN #TRAININGDATE AS B
    ON A.NAME = B.NAME
WHERE A.[DATE] > B.TRAINING_DATE
GROUP BY A.NAME;

-- Combine both averages. FULL OUTER JOIN keeps agents that have scores on only one side
-- of their training date; driving the join from #TEMP_B alone dropped agents with
-- pre-training scores only. A missing pre-training average is reported as 0.
SELECT
    COALESCE(POST.NAME, PRE.NAME) AS NAME,
    ISNULL(PRE.AVERAGE_SCORE_BEFORE, 0) AS AVERAGE_PRE_TRAINING,
    POST.AVERAGE_SCORE_AFTER_TRAINING
FROM #TEMP_B AS POST
FULL OUTER JOIN #TEMP_A AS PRE
    ON POST.NAME = PRE.NAME;
You can use derived tables to accomplish this:
-- One row per agent in Training with the average score before and after the training
-- date. The two derived tables are LEFT JOINed so that an agent with no scores on one
-- side of the training date (e.g. Cindy, who has no pre-training scores) is still
-- returned; the missing average is reported as 0, as in the expected output.
SELECT T.Agent_Name,
       T.Training_Date,
       COALESCE(Pre.Avg_Pre_Training, 0) AS Avg_Pre_Training,
       COALESCE(Post.Avg_Post_Training, 0) AS Avg_Post_Training
FROM Training as T
LEFT JOIN (SELECT T.Agent_Name, AVG(Score) as Avg_Pre_Training
      FROM Training as T
      JOIN Scores as S on S.Name= T.Agent_Name
      -- scores dated strictly before the training
      WHERE S.Date < T.Training_Date
      GROUP BY T.Agent_Name
     ) as Pre on Pre.Agent_Name= T.Agent_Name
LEFT JOIN (SELECT T.Agent_Name, AVG(Score) as Avg_Post_Training
      FROM Training as T
      JOIN Scores as S on S.Name= T.Agent_Name
      -- scores dated on or after the training
      WHERE S.Date >= T.Training_Date
      GROUP BY T.Agent_Name
     ) as Post on Post.Agent_Name= T.Agent_Name
Not totally sure I used alias' correctly for bigquery, and this is #legacySQL syntax, so it may need a few tweaks.
This should solve in Standard SQL:
-- Average score per agent on/before and after the agent's training date, using the
-- question's rows as inline CTEs. Dates are M/D/YYYY strings parsed at comparison time.
WITH table1 AS (
  -- scores
  SELECT 'Dan' AS agent_name, 81 AS score, '10/1/2016' AS date
  UNION ALL SELECT 'Brad', 35, '8/5/2016'
  UNION ALL SELECT 'Allison', 92, '6/3/2016'
  UNION ALL SELECT 'Cindy', 95, '8/12/2016'
  UNION ALL SELECT 'Dan', 45, '7/16/2016'
  UNION ALL SELECT 'Cindy', 77, '4/16/2016'
  UNION ALL SELECT 'Allison', 59, '3/22/2016'
  UNION ALL SELECT 'Brad', 55, '3/22/2016'
),
table2 AS (
  -- training date per agent
  SELECT 'Dan' agent_name, '8/28/2016' AS train_date
  UNION ALL SELECT 'Brad', '4/15/2016'
  UNION ALL SELECT 'Cindy', '3/3/2016'
  UNION ALL SELECT 'Allison', '5/1/2016'
)
SELECT
  t1.agent_name name,
  t2.train_date train_date,
  -- scores dated on or before the training
  AVG(IF(PARSE_DATE("%m/%d/%Y", t1.date) <= PARSE_DATE("%m/%d/%Y", t2.train_date), t1.score, NULL)) pre_score,
  -- scores dated after the training
  AVG(IF(PARSE_DATE("%m/%d/%Y", t1.date) > PARSE_DATE("%m/%d/%Y", t2.train_date), t1.score, NULL)) pos_score
FROM table1 t1
INNER JOIN table2 t2
  ON t1.agent_name = t2.agent_name
GROUP BY name, train_date
It's highly recommended that you use this version in BigQuery.

How to select statistics table counting occurrences between two dates

I need to count occurrences of protocol violations and durations between 2 dates from table to achieve effect like statistics table which will look like at the picture below:
Expected effect:
Explanation:
As you can see I need to select 'Country', 'Site' existing in Violations table and: 'Numbers', 'Maximum', 'Minimum' and 'Mean' of protocol violations duration existing in DB in the same table 'Violations' between two dates. So we have to count:
protocol violations occurrences existing in Violations table by country and site
min/max/avg durations of protocol violations by country and site
under two different conditions:
occurrences from Date Discovered to Date Reported
occurrences from Date Reported to Date Confirmed
Database Structure:
Available on SQLFiddle: Look HERE
I will add that code in attached SQLFIDDLE has more tables and an query but they are unnecessary right now for this problem. Feel free to use it.
I didn't remove old query because there is nice way to do:
'- All -' and
'- Unknown -' values. -
Violation table:
-- One row per protocol violation, with the three dates the statistics are based on.
create table violations (
id long, -- NOTE(review): the meaning of "long" depends on the database engine -- confirm it is a numeric type
country varchar(20),
site varchar(20),
status_id int,
trial_id int,
discovered_date date,
reporded_date date, -- "date reported"; the misspelling is the actual column name
confirmed_date date
);
Site table:
-- Lookup of sites.
create table site (
id long, -- NOTE(review): the meaning of "long" depends on the database engine -- confirm it is a numeric type
site varchar(20)
);
My First try:
Here is my new SQLFIDDLE with query needed to improve commented lines:
-- Asker's draft: number of violations per country and site for trial 3, with ROLLUP
-- adding per-country subtotals and a grand total. The commented lines are placeholders
-- for the statistics still to be written: max/min/mean of the per-violation duration
-- in days between discovered_date and reporded_date.
SELECT v.country as country, v.site as site,
COUNT(*) as N --,
--MAX(list of durations in days between discovered date to repored date on each violation by country and site) as "Maximum",
--MIN(list of durations in days between discovered date to repored date on each violation by country and site) as "Minimum",
--AVG(list of durations in days between discovered date to repored date on each violation by country and site) as "Mean"
FROM violations v
WHERE v.trial_id = 3
GROUP BY ROLLUP (v.country, v.site)
I've managed to create abstract query with my idea. But I have a problem to write correct query for MAX, MIN and AVG where we must select max/min/avg value from list of durations in days between discovered date to reported date on each violation by country and site.
Could you help me please?
Please check this query. It is simplified and may give you an idea and direction. If you need more then this then let me know. Copy and paste to see results. This query will select and calc only the results between two dates in where clause. You need to run inner query first w/out where to see all dates etc... This query counts violations between 2 dates. Not sure what is the list of duration in days... See below for count of duration. You may add MAX/MIN etc...
-- Days between (duration) = (end_date - start_date) = number of days (number) --
-- ANSI date literals are used so the result does not depend on the session's
-- NLS_DATE_FORMAT / NLS_DATE_LANGUAGE, which to_date('14-MAR-2013') without a mask does.
SELECT (DATE '2013-03-14' - DATE '2013-03-01') days_between
FROM dual
/
-- Violation count and max/min suspected violations per country and site, with ROLLUP
-- subtotals, for rows confirmed between 20 and 10 days ago.
WITH sample_violations AS (
-- Inline sample rows standing in for the real table. UNION ALL keeps every row;
-- plain UNION would silently drop exact duplicate rows and undercount violations.
SELECT 'GERMANY' country, '12222' site, 1 susp_viol, 2 conf_viol, trunc(Sysdate-30) disc_date, trunc(Sysdate-25) conf_date
FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 3 , 14, trunc(Sysdate-20) , trunc(Sysdate-15) FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 6 , 25, trunc(Sysdate-20) , trunc(Sysdate-15) FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 2 , 1, trunc(Sysdate-20) , trunc(Sysdate-15) FROM dual
UNION ALL
SELECT 'GERMANY', '13333' , 10 , 5, trunc(Sysdate-15) , trunc(Sysdate-10) FROM dual
UNION ALL
SELECT 'GERMANY', '13333' , 15 , 3, trunc(Sysdate-15) , trunc(Sysdate-10) FROM dual
UNION ALL
SELECT 'GERMANY', 'Unknown Site' , 0 , 7, trunc(Sysdate-5) , trunc(Sysdate-2) FROM dual
UNION ALL
SELECT 'RUSSIA', '12345' , 1 , 5, trunc(Sysdate-20) , trunc(Sysdate-15) FROM dual
UNION ALL
SELECT 'RUSSIA', '12345' , 2 , 10, trunc(Sysdate-15) , trunc(Sysdate-12) FROM dual
UNION ALL
SELECT 'RUSSIA', 'Unknown Site' , 10 , 10, trunc(Sysdate-3) , trunc(Sysdate-1) FROM dual
)
SELECT country, site
, Count(*) total_viol
, MAX(susp_viol) max_susp_viol
, MIN(susp_viol) min_susp_viol
FROM sample_violations
-- replace the sysdate expressions with your own dates, e.g. DATE '2013-03-14' or to_date with an explicit format mask
WHERE conf_date BETWEEN trunc(Sysdate-20) AND trunc(Sysdate-10)
GROUP BY ROLLUP (country, site)
ORDER BY country, site
/
Count of duration:
-- Number of violations per country, site and duration (conf_date - disc_date, in days),
-- with ROLLUP subtotals, for rows confirmed between 20 and 10 days ago.
WITH sample_violations AS (
-- Inline sample rows standing in for the real table. UNION ALL keeps every row;
-- plain UNION would silently drop exact duplicate rows and undercount violations.
SELECT 'GERMANY' country, '12222' site, 1 susp_viol, 2 conf_viol, trunc(Sysdate-30) disc_date, trunc(Sysdate-20) conf_date
FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 3 , 14, trunc(Sysdate-20) , trunc(Sysdate-12) FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 6 , 25, trunc(Sysdate-20) , trunc(Sysdate-12) FROM dual
UNION ALL
SELECT 'GERMANY', '12222' , 2 , 1, trunc(Sysdate-20) , trunc(Sysdate-12) FROM dual
UNION ALL
SELECT 'GERMANY', '13333' , 10 , 5, trunc(Sysdate-12) , trunc(Sysdate-6) FROM dual
UNION ALL
SELECT 'GERMANY', '13333' , 15 , 3, trunc(Sysdate-17) , trunc(Sysdate-11) FROM dual
UNION ALL
SELECT 'GERMANY', 'Unknown Site' , 0 , 7, trunc(Sysdate-5) , trunc(Sysdate-2) FROM dual
UNION ALL
SELECT 'RUSSIA', '12345' , 1 , 5, trunc(Sysdate-20) , trunc(Sysdate-15) FROM dual
UNION ALL
SELECT 'RUSSIA', '12345' , 2 , 10, trunc(Sysdate-15) , trunc(Sysdate-12) FROM dual
UNION ALL
SELECT 'RUSSIA', 'Unknown Site' , 10 , 10, trunc(Sysdate-3) , trunc(Sysdate-1) FROM dual
)
SELECT country, site, (conf_date-disc_date) duration, count(*) total_durations
FROM sample_violations
WHERE conf_date BETWEEN trunc(Sysdate-20) AND trunc(Sysdate-10)
GROUP BY ROLLUP (country, site, (conf_date-disc_date))
ORDER BY country, site
/