Inserting parameters to SQL query in Oracle SQL - sql

In the following query between date time columns are repeated in multiple places and I need to replace them with two variables named start_date and end_date I tried multiple methods and had no luck. Please answer with a runnable query if you can. Thanks in advance.
WITH encounter
AS (SELECT patient_pomr_id AS encounter_number,
patient_id AS umrn,
doctor_id,
doctor_name
FROM eh_pomr.ehpom_patient_pomr
WHERE created_on BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
chief_complain
AS (SELECT chief_complain,
patient_pomr_id
FROM eh_pomr.ehpom_chief_complain),
admission
AS (SELECT admitted_date,
patient_id,
ADMISSION_ID,
admission_type AS encounter_type,
patient_pomr_id,
hospital_id,
clinic_name
FROM ad_request.admlm_admission
WHERE direct_admission IS NULL
AND is_from_er != 1
AND created_date BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
ip_create_admission
AS (SELECT patientpomr,
dbms_lob.Substr(admitting_diagnosis, 2000, 1) diagnosis
FROM eh_ip.ehip_create_admission
WHERE created_on BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
discharge
AS (SELECT CASE
WHEN dischargevia = 1 THEN 'Private Vehicle'
WHEN dischargevia = 2 THEN 'Ambulatory'
WHEN dischargevia = 3 THEN 'Other'
ELSE ' Unknown'
END AS dischargevia,
pomrid,
modifiedon AS discharge_date,
conditionondischarge AS discharge_speciality
FROM eh_ndischarge.ehipd_dischargedetails
WHERE isactive = 1),
death
AS (SELECT dbms_lob.Substr(underlying_cause, 2000, 1) cause_of_death,
patientpomr
FROM eh_ip.ehip_death_detail),
empi
AS (SELECT id_number,
mrn
FROM rf_empi.emred_patients),
vitals
AS (SELECT PR.id,
PR.patient_pomr_id,
FS.field_code,
FS.value
FROM eh_commmon.ehcom_patient_record PR
left join eh_commmon.ehcom_flow_sheet_data FS
ON PR.id = FS.patient_record_id
WHERE PR.flow_sheet_code = 'vitals'
AND FS.time_stamp BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
leaves
AS (SELECT requesting_days,
visit_id,
ADM.PATIENT_POMR_ID
FROM ad_request.admlm_med_leave_final_print FP
left join ad_request.admlm_medical_leave ML
ON FP.request_id = ML.request_id
LEFT JOIN AD_REQUEST.ADMLM_ADMISSION ADM
ON ML.VISIT_ID = ADM.ADMISSION_ID
WHERE FP.leave_status = 5
AND ML.created_date BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'
AND ML.REQUESTING_DAYS IS NOT NULL)
SELECT DISTINCT encounter.encounter_number,
admission.encounter_type,
empi.id_number AS Patient_National_ID,
admission.patient_id AS umrn,
admission.admitted_date,
admission.hospital_id,
admission.clinic_name AS admission_speciality,
chief_complain.chief_complain,
leaves.requesting_days AS Duration_of_leave,
encounter.doctor_id,
encounter.doctor_name,
ip_create_admission.diagnosis,
discharge.dischargevia,
discharge.discharge_date,
discharge_speciality,
admission.clinic_name AS clinic,
death.cause_of_death
-- VITALS.field_code,
-- VITALS.value
FROM admission
left join empi
ON admission.patient_id = empi.mrn
left join encounter
ON admission.patient_pomr_id = encounter.encounter_number
left join ip_create_admission
ON admission.patient_pomr_id = ip_create_admission.patientpomr
--admission_request_numbrer with adt
left join discharge
ON admission.patient_pomr_id = discharge.pomrid
left join death
ON admission.patient_pomr_id = death.patientpomr
left join chief_complain
ON admission.patient_pomr_id = chief_complain.patient_pomr_id
left join leaves
ON admission.patient_pomr_id = leaves.PATIENT_POMR_ID
I tried adding with begin and end tags with declare key words but had no luck. Also is there a special way to insert variable using in to keyword when we need to insert it for between?

Include yet another CTE (I'm calling it dates) which is then cross-joined in another CTEs which utilize these values. Something like this:
WITH
dates (start_date, end_date) --> this is new CTE
AS (SELECT timestamp '2022-08-01 00:00:00',
timestamp '2022-08-30 00:00:00'
FROM dual),
encounter
AS (SELECT patient_pomr_id AS encounter_number,
patient_id AS umrn,
doctor_id,
doctor_name
FROM eh_pomr.ehpom_patient_pomr
CROSS JOIN dates d --> it is used here
WHERE created_on BETWEEN d.start_date AND d.end_date), --> like this
chief_complain
AS ..

This is from MSSQL, you can try converting this through OracleSQL
#dateFrom datetime = null,
#dateTo datetime = null,
DATEADD(D, 0, DATEDIFF(D, 0, #DateFrom))
AND DATEADD(D, 0, DATEDIFF(D, 0, #DateTo))

Related

How to optimize selection of pairs from one column of the table?

I'm using PostgreSQL 9.5.19, DBeaver 6.3.4
I have a table where one row is - user's name, place he attended, time when he was there
I need to select all pairs of places where any user was (if user was at place a and place b i need row like this: user, place a, place b, time at place a, time at place b)
The ponds table:
CREATE TABLE example.example (
tm timestamp NOT NULL,
place_name varchar NOT NULL,
user_name varchar NOT NULL
);
Some sample data:
INSERT INTO example.example (tm, place_name, user_name)
values
('2020-02-25 00:00:19.000', 'place_1', 'user_1'),
('2020-03-25 00:00:19.000', 'place_2', 'user_1'),
('2020-02-25 00:00:19.000', 'place_1', 'user_2'),
('2020-03-25 00:00:19.000', 'place_1', 'user_3'),
('2020-02-25 00:00:19.000', 'place_2', 'user_3');
I'm trying this script:
select
t.user_name
,t.place_name as r1_place
,max(t.tm) as r1_tm
,t2.place_name as r2_place
,min(t2.tm) as r2_tm
from example.example as t
join example.example as t2 on t.user_name = t2.user_name
and t.tm < t2.tm
and t.place_name <> t2.place_name
where t.tm between '2020-02-25 00:00:00' and '2020-03-25 15:00:00'
and t2.tm between '2020-02-25 00:00:00' and '2020-03-25 15:00:00'
group by t.user_name
, t.place_name
, t2.place_name
Seems like it gives me the right result, but it works really slow.
Can I optimize it somehow?
I would suggest trying indexes. For this query:
select t.user_name, t.place_name as r1_place, max(t.tm) as r1_tm,
t2.place_name as r2_place, min(t2.tm) as r2_tm
from schema.table t join
schema.table t2
on t.user_name = t2.user_name and
t.tm < t2.tm and
t.place_name <> t2.place_name
where t.tm between '2020-03-25 00:00:00' and '2020-03-25 15:00:00' and
t2.tm between '2020-03-25 00:00:00' and '2020-03-25 15:00:00'
group by t.user_name, t.place_name, t2.place_name
I would suggest an index on (tm, user_name, place_name) and on (user_name, tm, place_name) -- yes, both, one for each reference.
Colleague helped me to create window function:
select
subq.*
,EXTRACT(EPOCH FROM (subq.next_tm - subq.tm)) as seconds_diff
from (
select
t1.user_name,
t1.place_name,
t1.tm,
lead(t1.place_name) over w as next_place_name,
lead(t1.tm) over w as next_tm
from example.example as t1
window w as (partition by t1.user_name order by tm asc)
)subq
where
next_place_name is not null
and next_tm is not null
and place_name <> next_place_name
;

Aggregate for each day over time series, without using non-equijoin logic

Initial Question
Given the following dataset paired with a dates table:
MembershipId | ValidFromDate | ValidToDate
==========================================
0001 | 1997-01-01 | 2006-05-09
0002 | 1997-01-01 | 2017-05-12
0003 | 2005-06-02 | 2009-02-07
How many Memberships were open on any given day or timeseries of days?
Initial Answer
Following this question being asked here, this answer provided the necessary functionality:
select d.[Date]
,count(m.MembershipID) as MembershipCount
from DIM.[Date] as d
left join Memberships as m
on(d.[Date] between m.ValidFromDateKey and m.ValidToDateKey)
where d.CalendarYear = 2016
group by d.[Date]
order by d.[Date];
though a commenter remarked that There are other approaches when the non-equijoin takes too long.
Followup
As such, what would the equijoin only logic look like to replicate the output of the query above?
Progress So Far
From the answers provided so far I have come up with the below, which outperforms on the hardware I am working with across 3.2 million Membership records:
declare #s date = '20160101';
declare #e date = getdate();
with s as
(
select d.[Date] as d
,count(s.MembershipID) as s
from dbo.Dates as d
join dbo.Memberships as s
on d.[Date] = s.ValidFromDateKey
group by d.[Date]
)
,e as
(
select d.[Date] as d
,count(e.MembershipID) as e
from dbo.Dates as d
join dbo.Memberships as e
on d.[Date] = e.ValidToDateKey
group by d.[Date]
),c as
(
select isnull(s.d,e.d) as d
,sum(isnull(s.s,0) - isnull(e.e,0)) over (order by isnull(s.d,e.d)) as c
from s
full join e
on s.d = e.d
)
select d.[Date]
,c.c
from dbo.Dates as d
left join c
on d.[Date] = c.d
where d.[Date] between #s and #e
order by d.[Date]
;
Following on from that, to split this aggregate into constituent groups per day I have the following, which is also performing well:
declare #s date = '20160101';
declare #e date = getdate();
with s as
(
select d.[Date] as d
,s.MembershipGrouping as g
,count(s.MembershipID) as s
from dbo.Dates as d
join dbo.Memberships as s
on d.[Date] = s.ValidFromDateKey
group by d.[Date]
,s.MembershipGrouping
)
,e as
(
select d.[Date] as d
,e..MembershipGrouping as g
,count(e.MembershipID) as e
from dbo.Dates as d
join dbo.Memberships as e
on d.[Date] = e.ValidToDateKey
group by d.[Date]
,e.MembershipGrouping
),c as
(
select isnull(s.d,e.d) as d
,isnull(s.g,e.g) as g
,sum(isnull(s.s,0) - isnull(e.e,0)) over (partition by isnull(s.g,e.g) order by isnull(s.d,e.d)) as c
from s
full join e
on s.d = e.d
and s.g = e.g
)
select d.[Date]
,c.g
,c.c
from dbo.Dates as d
left join c
on d.[Date] = c.d
where d.[Date] between #s and #e
order by d.[Date]
,c.g
;
Can anyone improve on the above?
If most of your membership validity intervals are longer than few days, have a look at an answer by Martin Smith. That approach is likely to be faster.
When you take calendar table (DIM.[Date]) and left join it with Memberships, you may end up scanning the Memberships table for each date of the range. Even if there is an index on (ValidFromDate, ValidToDate), it may not be super useful.
It is easy to turn it around.
Scan the Memberships table only once and for each membership find those dates that are valid using CROSS APPLY.
Sample data
DECLARE #T TABLE (MembershipId int, ValidFromDate date, ValidToDate date);
INSERT INTO #T VALUES
(1, '1997-01-01', '2006-05-09'),
(2, '1997-01-01', '2017-05-12'),
(3, '2005-06-02', '2009-02-07');
DECLARE #RangeFrom date = '2006-01-01';
DECLARE #RangeTo date = '2006-12-31';
Query 1
SELECT
CA.dt
,COUNT(*) AS MembershipCount
FROM
#T AS Memberships
CROSS APPLY
(
SELECT dbo.Calendar.dt
FROM dbo.Calendar
WHERE
dbo.Calendar.dt >= Memberships.ValidFromDate
AND dbo.Calendar.dt <= Memberships.ValidToDate
AND dbo.Calendar.dt >= #RangeFrom
AND dbo.Calendar.dt <= #RangeTo
) AS CA
GROUP BY
CA.dt
ORDER BY
CA.dt
OPTION(RECOMPILE);
OPTION(RECOMPILE) is not really needed, I include it in all queries when I compare execution plans to be sure that I'm getting the latest plan when I play with the queries.
When I looked at the plan of this query I saw that the seek in the Calendar.dt table was using only ValidFromDate and ValidToDate, the #RangeFrom and #RangeTo were pushed to the residue predicate. It is not ideal. The optimiser is not smart enough to calculate maximum of two dates (ValidFromDate and #RangeFrom) and use that date as a starting point of the seek.
It is easy to help the optimiser:
Query 2
SELECT
CA.dt
,COUNT(*) AS MembershipCount
FROM
#T AS Memberships
CROSS APPLY
(
SELECT dbo.Calendar.dt
FROM dbo.Calendar
WHERE
dbo.Calendar.dt >=
CASE WHEN Memberships.ValidFromDate > #RangeFrom
THEN Memberships.ValidFromDate
ELSE #RangeFrom END
AND dbo.Calendar.dt <=
CASE WHEN Memberships.ValidToDate < #RangeTo
THEN Memberships.ValidToDate
ELSE #RangeTo END
) AS CA
GROUP BY
CA.dt
ORDER BY
CA.dt
OPTION(RECOMPILE)
;
In this query the seek is optimal and doesn't read dates that may be discarded later.
Finally, you may not need to scan the whole Memberships table.
We need only those rows where the given range of dates intersects with the valid range of the membership.
Query 3
SELECT
CA.dt
,COUNT(*) AS MembershipCount
FROM
#T AS Memberships
CROSS APPLY
(
SELECT dbo.Calendar.dt
FROM dbo.Calendar
WHERE
dbo.Calendar.dt >=
CASE WHEN Memberships.ValidFromDate > #RangeFrom
THEN Memberships.ValidFromDate
ELSE #RangeFrom END
AND dbo.Calendar.dt <=
CASE WHEN Memberships.ValidToDate < #RangeTo
THEN Memberships.ValidToDate
ELSE #RangeTo END
) AS CA
WHERE
Memberships.ValidToDate >= #RangeFrom
AND Memberships.ValidFromDate <= #RangeTo
GROUP BY
CA.dt
ORDER BY
CA.dt
OPTION(RECOMPILE)
;
Two intervals [a1;a2] and [b1;b2] intersect when
a2 >= b1 and a1 <= b2
These queries assume that Calendar table has an index on dt.
You should try and see what indexes are better for the Memberships table.
For the last query, if the table is rather large, most likely two separate indexes on ValidFromDate and on ValidToDate would be better than one index on (ValidFromDate, ValidToDate).
You should try different queries and measure their performance on the real hardware with real data. Performance may depend on the data distribution, how many memberships there are, what are their valid dates, how wide or narrow is the given range, etc.
I recommend to use a great tool called SQL Sentry Plan Explorer to analyse and compare execution plans. It is free. It shows a lot of useful stats, such as execution time and number of reads for each query. The screenshots above are from this tool.
On the assumption your date dimension contains all dates contained in all membership periods you can use something like the following.
The join is an equi join so can use hash join or merge join not just nested loops (which will execute the inside sub tree once for each outer row).
Assuming index on (ValidToDate) include(ValidFromDate) or reverse this can use a single seek against Memberships and a single scan of the date dimension. The below has an elapsed time of less than a second for me to return the results for a year against a table with 3.2 million members and general active membership of 1.4 million (script)
DECLARE #StartDate DATE = '2016-01-01',
#EndDate DATE = '2016-12-31';
WITH MD
AS (SELECT Date,
SUM(Adj) AS MemberDelta
FROM Memberships
CROSS APPLY (VALUES ( ValidFromDate, +1),
--Membership count decremented day after the ValidToDate
(DATEADD(DAY, 1, ValidToDate), -1) ) V(Date, Adj)
WHERE
--Members already expired before the time range of interest can be ignored
ValidToDate >= #StartDate
AND
--Members whose membership starts after the time range of interest can be ignored
ValidFromDate <= #EndDate
GROUP BY Date),
MC
AS (SELECT DD.DateKey,
SUM(MemberDelta) OVER (ORDER BY DD.DateKey ROWS UNBOUNDED PRECEDING) AS CountOfNonIgnoredMembers
FROM DIM_DATE DD
LEFT JOIN MD
ON MD.Date = DD.DateKey)
SELECT DateKey,
CountOfNonIgnoredMembers AS MembershipCount
FROM MC
WHERE DateKey BETWEEN #StartDate AND #EndDate
ORDER BY DateKey
Demo (uses extended period as the calendar year of 2016 isn't very interesting with the example data)
One approach is to first use an INNER JOIN to find the set of matches and COUNT() to project MemberCount GROUPed BY DateKey, then UNION ALL with the same set of dates, with a 0 on that projection for the count of members for each date. The last step is to SUM() the MemberCount of this union, and GROUP BY DateKey. As requested, this avoids LEFT JOIN and NOT EXISTS. As another member pointed out, this is not an equi-join, because we need to use a range, but I think it does what you intend.
This will serve up 1 year's worth of data with around 100k logical reads. On an ordinary laptop with a spinning disk, from cold cache, it serves 1 month in under a second (with correct counts).
Here is an example that creates 3.3 million rows of random duration. The query at the bottom returns one month's worth of data.
--Stay quiet for a moment
SET NOCOUNT ON
SET STATISTICS IO OFF
SET STATISTICS TIME OFF
--Clean up if re-running
DROP TABLE IF EXISTS DIM_DATE
DROP TABLE IF EXISTS FACT_MEMBER
--Date dimension
CREATE TABLE DIM_DATE
(
DateKey DATE NOT NULL
)
--Membership fact
CREATE TABLE FACT_MEMBER
(
MembershipId INT NOT NULL
, ValidFromDateKey DATE NOT NULL
, ValidToDateKey DATE NOT NULL
)
--Populate Date dimension from 2001 through end of 2018
DECLARE #startDate DATE = '2001-01-01'
DECLARE #endDate DATE = '2018-12-31'
;WITH CTE_DATE AS
(
SELECT #startDate AS DateKey
UNION ALL
SELECT
DATEADD(DAY, 1, DateKey)
FROM
CTE_DATE AS D
WHERE
D.DateKey < #endDate
)
INSERT INTO
DIM_DATE
(
DateKey
)
SELECT
D.DateKey
FROM
CTE_DATE AS D
OPTION (MAXRECURSION 32767)
--Populate Membership fact with members having a random membership length from 1 to 36 months
;WITH CTE_DATE AS
(
SELECT #startDate AS DateKey
UNION ALL
SELECT
DATEADD(DAY, 1, DateKey)
FROM
CTE_DATE AS D
WHERE
D.DateKey < #endDate
)
,CTE_MEMBER AS
(
SELECT 1 AS MembershipId
UNION ALL
SELECT MembershipId + 1 FROM CTE_MEMBER WHERE MembershipId < 500
)
,
CTE_MEMBERSHIP
AS
(
SELECT
ROW_NUMBER() OVER (ORDER BY NEWID()) AS MembershipId
, D.DateKey AS ValidFromDateKey
FROM
CTE_DATE AS D
CROSS JOIN CTE_MEMBER AS M
)
INSERT INTO
FACT_MEMBER
(
MembershipId
, ValidFromDateKey
, ValidToDateKey
)
SELECT
M.MembershipId
, M.ValidFromDateKey
, DATEADD(MONTH, FLOOR(RAND(CHECKSUM(NEWID())) * (36-1)+1), M.ValidFromDateKey) AS ValidToDateKey
FROM
CTE_MEMBERSHIP AS M
OPTION (MAXRECURSION 32767)
--Add clustered Primary Key to Date dimension
ALTER TABLE DIM_DATE ADD CONSTRAINT PK_DATE PRIMARY KEY CLUSTERED
(
DateKey ASC
)
--Index
--(Optimize in your spare time)
DROP INDEX IF EXISTS SK_FACT_MEMBER ON FACT_MEMBER
CREATE CLUSTERED INDEX SK_FACT_MEMBER ON FACT_MEMBER
(
ValidFromDateKey ASC
, ValidToDateKey ASC
, MembershipId ASC
)
RETURN
--Start test
--Emit stats
SET STATISTICS IO ON
SET STATISTICS TIME ON
--Establish range of dates
DECLARE
#rangeStartDate DATE = '2010-01-01'
, #rangeEndDate DATE = '2010-01-31'
--UNION the count of members for a specific date range with the "zero" set for the same range, and SUM() the counts
;WITH CTE_MEMBER
AS
(
SELECT
D.DateKey
, COUNT(*) AS MembershipCount
FROM
DIM_DATE AS D
INNER JOIN FACT_MEMBER AS M ON
M.ValidFromDateKey <= #rangeEndDate
AND M.ValidToDateKey >= #rangeStartDate
AND D.DateKey BETWEEN M.ValidFromDateKey AND M.ValidToDateKey
WHERE
D.DateKey BETWEEN #rangeStartDate AND #rangeEndDate
GROUP BY
D.DateKey
UNION ALL
SELECT
D.DateKey
, 0 AS MembershipCount
FROM
DIM_DATE AS D
WHERE
D.DateKey BETWEEN #rangeStartDate AND #rangeEndDate
)
SELECT
M.DateKey
, SUM(M.MembershipCount) AS MembershipCount
FROM
CTE_MEMBER AS M
GROUP BY
M.DateKey
ORDER BY
M.DateKey ASC
OPTION (RECOMPILE, MAXDOP 1)
Here's how I'd solve this problem with equijoin:
--data generation
declare #Membership table (MembershipId varchar(10), ValidFromDate date, ValidToDate date)
insert into #Membership values
('0001', '1997-01-01', '2006-05-09'),
('0002', '1997-01-01', '2017-05-12'),
('0003', '2005-06-02', '2009-02-07')
declare #startDate date, #endDate date
select #startDate = MIN(ValidFromDate), #endDate = max(ValidToDate) from #Membership
--in order to use equijoin I need all days between min date and max date from Membership table (both columns)
;with cte as (
select #startDate [date]
union all
select DATEADD(day, 1, [date]) from cte
where [date] < #endDate
)
--in this query, we will assign value to each day:
--one, if project started on that day
--minus one, if project ended on that day
--then, it's enough to (cumulative) sum all this values to get how many projects were ongoing on particular day
select [date],
sum(case when [DATE] = ValidFromDate then 1 else 0 end +
case when [DATE] = ValidToDate then -1 else 0 end)
over (order by [date] rows between unbounded preceding and current row)
from cte [c]
left join #Membership [m]
on [c].[date] = [m].ValidFromDate or [c].[date] = [m].ValidToDate
option (maxrecursion 0)
Here's another solution:
--data generation
declare #Membership table (MembershipId varchar(10), ValidFromDate date, ValidToDate date)
insert into #Membership values
('0001', '1997-01-01', '2006-05-09'),
('0002', '1997-01-01', '2017-05-12'),
('0003', '2005-06-02', '2009-02-07')
;with cte as (
select CAST('2016-01-01' as date) [date]
union all
select DATEADD(day, 1, [date]) from cte
where [date] < '2016-12-31'
)
select [date],
(select COUNT(*) from #Membership where ValidFromDate < [date]) -
(select COUNT(*) from #Membership where ValidToDate < [date]) [ongoing]
from cte
option (maxrecursion 0)
Pay attention, I think #PittsburghDBA is right when it says that current query return wrong result.
The last day of membership is not counted and so final sum is lower than it should be.
I have corrected it in this version.
This should improve a bit your actual progress:
declare #s date = '20160101';
declare #e date = getdate();
with
x as (
select d, sum(c) c
from (
select ValidFromDateKey d, count(MembershipID) c
from Memberships
group by ValidFromDateKey
union all
-- dateadd needed to count last day of membership too!!
select dateadd(dd, 1, ValidToDateKey) d, -count(MembershipID) c
from Memberships
group by ValidToDateKey
)x
group by d
),
c as
(
select d, sum(x.c) over (order by d) as c
from x
)
select d.day, c cnt
from calendar d
left join c on d.day = c.d
where d.day between #s and #e
order by d.day;
First of all, your query yields '1' as MembershipCount even if no active membership exists for the given date.
You should return SUM(CASE WHEN m.MembershipID IS NOT NULL THEN 1 ELSE 0 END) AS MembershipCount.
For optimal performance create an index on Memberships(ValidFromDateKey, ValidToDateKey, MembershipId) and another on DIM.[Date](CalendarYear, DateKey).
With that done, the optimal query shall be:
DECLARE #CalendarYear INT = 2000
SELECT dim.DateKey, SUM(CASE WHEN con.MembershipID IS NOT NULL THEN 1 ELSE 0 END) AS MembershipCount
FROM
DIM.[Date] dim
LEFT OUTER JOIN (
SELECT ValidFromDateKey, ValidToDateKey, MembershipID
FROM Memberships
WHERE
ValidFromDateKey <= CONVERT(DATETIME, CONVERT(VARCHAR, #CalendarYear) + '1231')
AND ValidToDateKey >= CONVERT(DATETIME, CONVERT(VARCHAR, #CalendarYear) + '0101')
) con
ON dim.DateKey BETWEEN con.ValidFromDateKey AND con.ValidToDateKey
WHERE dim.CalendarYear = #CalendarYear
GROUP BY dim.DateKey
ORDER BY dim.DateKey
Now, for your last question, what would be the equijoin equivalent query.
There is NO WAY you can rewrite this as a non-equijoin!
Equijoin doesn't imply using join sintax. Equijoin implies using an equals predicate, whatever the sintax.
Your query yields a range comparison, hence equals doesn't apply: a between or similar is required.

JOIN Subselect range in ON Condition? (Why and behavioral benefits?)

The statement in question is working correctly, I just need to understand why and how!
I have encountered the following Syntax in a statement which gets information for employee analysis.
Never seen anything like it and cannot find anything about the behavior using google. Hope you people can help me understand how this works and how to use it.
Table 1 Employees (EmployeeID, Name, Birthdate, ...)
Table 2 Contracts (ContractID, EmployeeID, Startdate, Enddate, ...)
Table 3 Time-models (TimeID, ContractID, EmployeeID, MonthlyDate, ...)
Table 4 Insurance (InsuranceID, ContractID, EmployeeID, JobType, ...)
The subselect in the code later on outputs:
Columname MonthlyDate
Values('2016-02-01 00:00:00.000',
'2016-04-01 00:00:00.000',
'2016-01-01 00:00:00.000',
'2016-03-01 00:00:00.000',
'2016-06-01 00:00:00.000',
'2016-05-01 00:00:00.000',
'2016-07-01 00:00:00.000')
The SQL in question looks like this:
SELECT E.EmployeeID+C.ContractID, D.Date, E.Name
FROM Employees AS E
LEFT OUTER JOIN Contract AS C ON E.EmployeeID = C.Employee ID
--This is where the SQL I was talking about starts
LEFT OUTER JOIN (
SELECT MonthlyDate
FROM Time-models
WHERE MonthlyDate >= Convert(DATETIME, '2016-01-01', 102)) AS D
ON (D.MonthlyDate >= C.ContractStartDate AND D.MonthlyDate <= C.ContractEndDate)
/*I can not explain this On-Condition*/
LEFT OUTER JOIN Insurance as I
ON (I.EmployeeID = E.EmployeeID AND I.ContractID = C.ContractID)
WHERE I.JobType = 'Clerk'
AND (I.InsuranceStartDate <= D.MonthlyDate AND I.InsuranceEndDate >= D.MonthlyDate)
/* I can not understand which compare values are used in D.MonthlyDate
because of the the vague ON-Condition */
The WHERE condition of your query is turning the LEFT JOIN into an INNER JOIN. (If a row doesn't match then the I columns will be NULL and fail the WHERE conditions.)
In addition:
You do not need the subquery at all.
SQL Server recognizes dates in the format YYYYMMDD and almost always in the format YYYY-MM-DD without conversion.
So, you might as well re-write the query as:
SELECT E.EmployeeID+C.ContractID, D.Date, E.Name
FROM Employees E JOIN
Contract C
ON E.EmployeeID = C.Employee ID JOI
[Time-models] d
ON D.MonthlyDate >= C.ContractStartDate AND
D.MonthlyDate <= C.ContractEndDate AND
MonthlyDate >= '2016-01-01' JOIN
Insurance as I
ON I.EmployeeID = E.EmployeeID AND
I.ContractID = C.ContractID
WHERE I.JobType = 'Clerk' AND
I.InsuranceStartDate <= D.MonthlyDate AND
I.InsuranceEndDate >= D.MonthlyDate;
I'm not sure if this answers your question, though.
"I don't understand how one single value can be matched."
No this apparently will match a number of rows, not a single one. Consider this simple example
select *
from (
values
(1, cast('2016-02-02' as date), cast('2016-03-02' as date))
,(2, cast('2016-02-02' as date), cast('2016-05-02' as date))
,(3, cast('2016-02-02' as date), cast('2016-02-04' as date))
) C(id,ContractStartDate, ContractEndDate)
left join (
values
('2016-02-01')
,('2016-04-01')
,('2016-01-01')
,('2016-03-01')
,('2016-06-01')
,('2016-05-01')
,('2016-07-01')
) D(MonthlyDate)
on D.MonthlyDate >= C.ContractStartDate AND D.MonthlyDate <= C.ContractEndDate
;
The query will return 1 to 3 rows from D for every C row.

Multiple columns are specified in an aggregated expression containing an outer reference TSQL

I have the following query:
SELECT
FileNumber,
dbo.GetLocalDateTimeFunc(SentDate) AS SentDate
INTO #tmp1
FROM FileMain f
JOIN FileActions fa ON f.FileID = fa.FileID
WHERE ActionDefID = 15 AND SentDate IS NOT NULL
SELECT
FileNumber,
dbo.GetLocalDateTimeFunc(ReceivedDate) AS ReceivedDate
INTO #tmp2
FROM FileMain f
JOIN FileActions fa ON f.FileID = fa.FileID
WHERE ActionDefID = 23 AND ReceivedDate IS NOT NULL
SELECT DISTINCT
o.Name AS Company, fm.FileNumber, pc.Name as Client,
p.State, c.County, t1.SentDate, t2.ReceivedDate,
(SELECT sum(case
when dateadd(day, datediff(day, 0, t1.SentDate), 0) = dateadd(day, datediff(day, 0, t2.ReceivedDate), 0) then
datediff(second, t1.SentDate, t2.ReceivedDate)
when [DATE] = dateadd(day, datediff(day, 0, t1.SentDate), 0) then
case
when t1.SentDate > [DATE] + begin_time then datediff(second, t1.SentDate, [DATE] + end_time)
else duration
end
when [DATE] = dateadd(day, datediff(day, 0, t2.ReceivedDate), 0) then
case
when t2.ReceivedDate < [DATE] + end_time then datediff(second, [DATE] + begin_time, t2.ReceivedDate)
else duration
end
else duration
end
)
/ 60.0 / 60.0
FROM F_TABLE_DATE(t1.SentDate, t2.ReceivedDate) d
INNER JOIN Unisource_Calendar c ON d.WEEKDAY_NAME_LONG = c.day_name)
FROM Office o
JOIN PartnerCompany pc ON o.OfficeID = pc.OfficeID
JOIN FileMain fm ON o.OfficeID = fm.OfficeID AND pc.PartnerCompanyID = fm.ClientID
JOIN Property p ON p.FileID = fm.FileID
JOIN County c ON p.CountyID = c.CountyID
JOIN FileActions fa ON fm.FileID = fa.FileID
JOIN #tmp1 t1 ON t1.FileNumber = fm.FileNumber
JOIN #tmp2 t2 ON t2.FileNumber = fm.FileNumber
WHERE p.State IN ('AR', 'CA', 'CO', 'DE', 'DC', 'FL', 'GA', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NJ', 'NV', 'NH', 'NY', 'NC', 'ND', 'OH', 'OK', 'PA', 'RI', 'SC', 'TN', 'TX', 'VA', 'WV', 'WI')
ORDER BY SentDate, FileNumber DESC
I'm getting the following error on my subquery:
Multiple columns are specified in an aggregated expression containing an outer reference. If an expression being aggregated contains an outer reference, then that outer reference must be the only column referenced in the expression.
Does anybody know how to fix this?
Or if someone has a function that can calculate datetime differences while excluding business hours and weekends that would help also. Thanks!
I would recommend you to simplify your code using CTEs for a start (enumeration of ALL tables distracts to give a precise statement). Also you should try your aggregate SUM function as a part of PARTITION by expression. This would probably help to avoid the problem you mentioned.
From what I can glean, the table function F_Table_Date is returning DATE or DATETIME rows for each day between the two parameters, and the UnisourceCalendar Is likely a list of work days (to allot for holidays as you mentioned). If this is the case, and UnisourceCalendar also returns a DATE or DATETIME column, consider this for your subquery:
SELECT (COUNT(*) * 60*60*24)
+ (
SELECT COUNT(*)
FROM UnisourceCalendar
WHERE [DATE] = CAST(CONVERT(VARCHAR,t1.SentDate+1,112) AS DATETIME)
)*DATEDIFF(SS,t1.SentDate,CAST(CONVERT(VARCHAR,t1.SentDate+1,112) AS DATETIME))
+ (
SELECT COUNT(*)
FROM UnisourceCalendar
WHERE [DATE] = CAST(CONVERT(VARCHAR,t1.SentDate+1,112) AS DATETIME)
)*DATEDIFF(SS,CAST(CONVERT(VARCHAR,t2.ReceivedDate,112) AS DATETIME),t2.ReceivedDate)
FROM UnisourceCalendar C
WHERE C.[DATE] > t1.SentDate AND C.[DATE] < t2.ReceivedDate
GROUP BY t1.SentDate, t2.ReceivedDate
What's at Play here:
Presuming 1 row per business day from UnisourceCalendar, any other join is superfluous.
A count is all that's needed, then.
The datediff of a converted/cast value of one date against itself using style 112 strips the time out and is recast as midnight, thus allowing us to get the seconds to next midnight from the sent date, and from the previous midnight of the received date, but only if each date is in the unisource calendar (mulitply by count, if 0, then no seconds added, if 1, then add the extra seconds).
Output is presuming that you will be dividing the results down to hours outside the subquery as you are.
Complicated? Sure, but it should output the results you're looking for in relatively short order.

Finding overlapping dates

I have a set of Meeting rooms and meetings in that having start date and end Date. A set of meeting rooms belong to a building.
The meeting details are kept in MeetingDetail table having a startDate and endDate.
Now I want to fire a report between two time period say reportStartDate and reportEndDate, which finds me the time slots in which all the meeting rooms are booked for a given building
Table structure
MEETING_ROOM - ID, ROOMNAME, BUILDING_NO
MEETING_DETAIL - ID, MEETING_ROOM_ID, START_DATE, END_DATE
The query has to be fired for reportStartDate and REportEndDate
Just to clarify further, the aim is to find all the time slots in which all the meeting rooms were booked in a given time period of reportStartDate and reportEndDate
For SQL Server 2005+ you could try the following (see note at the end for mysql)
WITH TIME_POINTS (POINT_P) AS
(SELECT DISTINCT START_DATE FROM MEETING_DETAIL
WHERE START_DATE > #reportStartDate AND START_DATE < #reportEndDate
UNION SELECT DISTINCT END_DATE FROM MEETING_DETAIL
WHERE END_DATE > #reportStartDate AND END_DATE < #reportEndDate
UNION SELECT #reportEndDate
UNION SELECT #reportStartDate),
WITH TIME_SLICE (START_T, END_T) AS
(SELECT A.POINT_P, MIN(B.POINT_P) FROM
TIMEPOINTS A
INNER JOIN TIMEPOINTS B ON A.POINT_P > B.POINT_P
GROUP BY A.POINT_P),
WITH SLICE_MEETINGS (START_T, END_T, MEETING_ROOM_ID, BUILDING_NO) AS
(SELECT START_T, END_T, MEETING_ROOM_ID, BUILDING_NO FROM
TIME_SLICE A
INNER JOIN MEETING_DETAIL B ON B.START_DATE <= A.START_T AND B.END_DATE >= B.END_T
INNER JOIN MEETING_ROOM C ON B.MEETING_ROOM_ID = C.ID),
WITH SLICE_COUNT (START_T, END_T, BUILDING_NO, ROOMS_C) AS
(SELECT START_T, END_T, BUILDING_NO, COUNT(MEETING_ROOM_ID) FROM
SLICE_MEETINGS
GROUP BY START_T, END_T, BUILDING_NO),
WITH ROOMS_BUILDING (BUILDING_NO, ROOMS_C) AS
(SELECT BUILDING_NO, COUNT(ID) FROM
MEETING_ROOM
GROUP BY BUILDING_NO)
SELECT B.BUILDING_NO, A.START_T, A.END_T
FROM SLICE_COUNT A.
INNER JOIN ROOMS_BUILDING B WHERE A.BUILDING_NO = B.BUILDING_NO AND B.ROOMS_C = A.ROOMS_C;
what it does is (each step corresponds to each CTE definition above)
Get all the time markers, i.e. end or start times
Get all time slices i.e. the smallest unit of time between which there is no other time marker (i.e. no meetings start in a time slice, it's either at the beginning or at the end of a time slice)
Get meetings for each time slice, so now you get something like
10.30 11.00 Room1 BuildingA
10.30 11.00 Room2 BuildingA
11.00 12.00 Room1 BuildingA
Get counts of rooms booked per building per time slice
Filter out timeslice-building combinations that match the number of rooms in each building
Edit
Since mysql doesn't support the WITH clause you'll have to construct views for each (of the 5) WITH clases above. everything else would remain the same.
After reading your comment, I think I understand the problem a bit better. As a first step I would generate a matrix of meeting rooms and time slots using cross join:
select *
from (
select distinct start_date
, end_date
from #meeting_detail
) ts
cross join
#meeting_room mr
Then, for each cell in the matrix, add meetings in that timeslot:
left join
#meeting_detail md
on mr.id = md.meeting_room_id
and ts.start_date < md.end_date
and md.start_date < ts.end_date
And then demand that there are no free rooms. For example, by saying that the left join must succeed for all rooms and time slots. A left join succeeds if any field is not null:
group by
mr.building_no
, ts.start_date
, ts.end_date
having max(case when md.meeting_room_id is null
then 1 else 0 end) = 0
Here's a complete working example. It's written for SQL Server, and the table variables (#meeting_detail) won't work in MySQL. But the report generating query should work in most databases:
set nocount on
declare #meeting_room table (id int, roomname varchar(50),
building_no int)
declare #meeting_detail table (meeting_room_id int,
start_date datetime, end_date datetime)
insert #meeting_room (id, roomname, building_no)
select 1, 'Kitchen', 6
union all select 2, 'Ballroom', 6
union all select 3, 'Conservatory', 7
union all select 4, 'Dining Room', 7
insert #meeting_detail (meeting_room_id, start_date, end_date)
select 1, '2010-08-01 9:00', '2010-08-01 10:00'
union all select 1, '2010-08-01 10:00', '2010-08-01 11:00'
union all select 2, '2010-08-01 10:00', '2010-08-01 11:00'
union all select 3, '2010-08-01 10:00', '2010-08-01 11:00'
select mr.building_no
, ts.start_date
, ts.end_date
from (
select distinct start_date
, end_date
from #meeting_detail
) ts
cross join
#meeting_room mr
left join
#meeting_detail md
on mr.id = md.meeting_room_id
and ts.start_date < md.end_date
and md.start_date < ts.end_date
group by
mr.building_no
, ts.start_date
, ts.end_date
having max(case when md.meeting_room_id is null
then 1 else 0 end) = 0
This prints:
building_no start end
6 2010-08-01 10:00:00.000 2010-08-01 11:00:00.000