SQL update with a LAG

SQL update with a LAG - sql

I have the following update that updates a record based on a prior record -
WITH CTE AS(
SELECT
patient,
start,
CASE
WHEN
ISNULL(start, '') = ''
AND cd = '3'
AND LAG([thru_dt]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) BETWEEN CONVERT(VARCHAR, DATEADD(DAY, -30, thru_dt), 112) AND thru_dt
AND LAG([cd]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) = '30'
THEN LAG([start]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC)
WHEN ISNULL(hhstrtdt, '') = '' AND
ROW_NUMBER() OVER(PARTITION BY patient ORDER BY cast(claimno AS int) DESC) = 1
AND LAG([cd]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) = '30'
AND LAG([thru_dt]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) BETWEEN CONVERT(VARCHAR, DATEADD(DAY, -30, thru_dt), 112) AND thru_dt
THEN LAG([start]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC)
ELSE start
END AS new_start
FROM table
)
UPDATE CTE
SET start = new_start
This query only updates one record at a time. For example, if I had this input -
start patient cd
20190307 497863693 30
NULL 497863693 30
NULL 497863693 30
NULL 497863693 30
To update, all 3 rows with a NULL start with the value of the first row, I would have to run the query 3 times.
The output would be -
start patient cd
20190307 497863693 30
20190307 497863693 30
20190307 497863693 30
20190307 497863693 30
Is there a way to get the query to update all rows for the above patient instead of it doing it one by one? I could keep a count of the NULLs in the table and make the update stop running once the no.of NULLs in the table stopped decreasing but that does not seem like a good way of doing it.

You can use a cumulative max for this. If I understand correctly, you want to reset NULL values on rows with cd = 30 to latest start value when cd = '30':
with toupdate as (
select t.*,
max(start) over (partition by patientid order by cast(claimno AS int)) as imputed_start
from t
where cd = '30'
)
update toupdate
set start = imputed_start
where start is null or imputed_start <> start;

I think the simplest way to update the whole table at once would be something like this
with rn_cte as (
select * , row_number() over (partition by patient order by (select null)) rn
from tTable)
update tTable
set start=NULl
where rn>=2;

Related

Query without WHILE Loop

We have appointment table as shown below. Each appointment need to be categorized as "New" or "Followup". Any appointment (for a patient) within 30 days of first appointment (of that patient) is Followup. After 30 days, appointment is again "New". Any appointment within 30 days become "Followup".
I am currently doing this by typing while loop.
How to achieve this without WHILE loop?
Table
CREATE TABLE #Appt1 (ApptID INT, PatientID INT, ApptDate DATE)
INSERT INTO #Appt1
SELECT 1,101,'2020-01-05' UNION
SELECT 2,505,'2020-01-06' UNION
SELECT 3,505,'2020-01-10' UNION
SELECT 4,505,'2020-01-20' UNION
SELECT 5,101,'2020-01-25' UNION
SELECT 6,101,'2020-02-12' UNION
SELECT 7,101,'2020-02-20' UNION
SELECT 8,101,'2020-03-30' UNION
SELECT 9,303,'2020-01-28' UNION
SELECT 10,303,'2020-02-02'

You need to use recursive query.
The 30days period is counted starting from prev(and no it is not possible to do it without recursion/quirky update/loop). That is why all the existing answer using only ROW_NUMBER failed.
WITH f AS (
SELECT *, rn = ROW_NUMBER() OVER(PARTITION BY PatientId ORDER BY ApptDate)
FROM Appt1
), rec AS (
SELECT Category = CAST('New' AS NVARCHAR(20)), ApptId, PatientId, ApptDate, rn, startDate = ApptDate
FROM f
WHERE rn = 1
UNION ALL
SELECT CAST(CASE WHEN DATEDIFF(DAY, rec.startDate,f.ApptDate) <= 30 THEN N'FollowUp' ELSE N'New' END AS NVARCHAR(20)),
f.ApptId,f.PatientId,f.ApptDate, f.rn,
CASE WHEN DATEDIFF(DAY, rec.startDate, f.ApptDate) <= 30 THEN rec.startDate ELSE f.ApptDate END
FROM rec
JOIN f
ON rec.rn = f.rn - 1
AND rec.PatientId = f.PatientId
)
SELECT ApptId, PatientId, ApptDate, Category
FROM rec
ORDER BY PatientId, ApptDate;
db<>fiddle demo
Output:
+---------+------------+-------------+----------+
| ApptId | PatientId | ApptDate | Category |
+---------+------------+-------------+----------+
| 1 | 101 | 2020-01-05 | New |
| 5 | 101 | 2020-01-25 | FollowUp |
| 6 | 101 | 2020-02-12 | New |
| 7 | 101 | 2020-02-20 | FollowUp |
| 8 | 101 | 2020-03-30 | New |
| 9 | 303 | 2020-01-28 | New |
| 10 | 303 | 2020-02-02 | FollowUp |
| 2 | 505 | 2020-01-06 | New |
| 3 | 505 | 2020-01-10 | FollowUp |
| 4 | 505 | 2020-01-20 | FollowUp |
+---------+------------+-------------+----------+
How it works:
f - get starting point(anchor - per every PatientId)
rec - recursibe part, if the difference between current value and prev is > 30 change the category and starting point, in context of PatientId
Main - display sorted resultset
Similar class:
Conditional SUM on Oracle - Capping a windowed function
Session window (Azure Stream Analytics)
Running Total until specific condition is true - Quirky update
Addendum
Do not ever use this code on production!
But another option, that is worth mentioning besides using cte, is to use temp table and update in "rounds"
It could be done in "single" round(quirky update):
CREATE TABLE Appt_temp (ApptID INT , PatientID INT, ApptDate DATE, Category NVARCHAR(10))
INSERT INTO Appt_temp(ApptId, PatientId, ApptDate)
SELECT ApptId, PatientId, ApptDate
FROM Appt1;
CREATE CLUSTERED INDEX Idx_appt ON Appt_temp(PatientID, ApptDate);
Query:
DECLARE #PatientId INT = 0,
#PrevPatientId INT,
#FirstApptDate DATE = NULL;
UPDATE Appt_temp
SET #PrevPatientId = #PatientId
,#PatientId = PatientID
,#FirstApptDate = CASE WHEN #PrevPatientId <> #PatientId THEN ApptDate
WHEN DATEDIFF(DAY, #FirstApptDate, ApptDate)>30 THEN ApptDate
ELSE #FirstApptDate
END
,Category = CASE WHEN #PrevPatientId <> #PatientId THEN 'New'
WHEN #FirstApptDate = ApptDate THEN 'New'
ELSE 'FollowUp'
END
FROM Appt_temp WITH(INDEX(Idx_appt))
OPTION (MAXDOP 1);
SELECT * FROM Appt_temp ORDER BY PatientId, ApptDate;
db<>fiddle Quirky update

You could do this with a recursive cte. You should first order by apptDate within each patient. That can be accomplished by a run-of-the-mill cte.
Then, in the anchor portion of your recursive cte, select the first ordering for each patient, mark the status as 'new', and also mark the apptDate as the date of the most recent 'new' record.
In the recursive portion of your recursive cte, increment to the next appointment, calculate the difference in days between the present appointment and the most recent 'new' appointment date. If it's greater than 30 days, mark it 'new' and reset the most recent new appointment date. Otherwise mark it as 'follow up' and just pass along the existing days since new appointment date.
Finallly, in the base query, just select the columns you want.
with orderings as (
select *,
rn = row_number() over(
partition by patientId
order by apptDate
)
from #appt1 a
),
markings as (
select apptId,
patientId,
apptDate,
rn,
type = convert(varchar(10),'new'),
dateOfNew = apptDate
from orderings
where rn = 1
union all
select o.apptId, o.patientId, o.apptDate, o.rn,
type = convert(varchar(10),iif(ap.daysSinceNew > 30, 'new', 'follow up')),
dateOfNew = iif(ap.daysSinceNew > 30, o.apptDate, m.dateOfNew)
from markings m
join orderings o
on m.patientId = o.patientId
and m.rn + 1 = o.rn
cross apply (select daysSinceNew = datediff(day, m.dateOfNew, o.apptDate)) ap
)
select apptId, patientId, apptDate, type
from markings
order by patientId, rn;
I should mention that I initially deleted this answer because Abhijeet Khandagale's answer seemed to meet your needs with a simpler query (after reworking it a bit). But with your comment to him about your business requirement and your added sample data, I undeleted mine because believe this one meets your needs.

I'm not sure that it's exactly what you implemented. But another option, that is worth mentioning besides using cte, is to use temp table and update in "rounds". So we are going to update temp table while all statuses are not set correctly and build result in an iterative way. We can control number of iteration using simply local variable.
So we split each iteration into two stages.
Set all Followup values that are near to New records. That's pretty easy to do just using right filter.
For the rest of the records that dont have status set we can select first in group with same PatientID. And say that they are new since they not processed by the first stage.
So
CREATE TABLE #Appt2 (ApptID INT, PatientID INT, ApptDate DATE, AppStatus nvarchar(100))
select * from #Appt1
insert into #Appt2 (ApptID, PatientID, ApptDate, AppStatus)
select a1.ApptID, a1.PatientID, a1.ApptDate, null from #Appt1 a1
declare #limit int = 0;
while (exists(select * from #Appt2 where AppStatus IS NULL) and #limit < 1000)
begin
set #limit = #limit+1;
update a2
set
a2.AppStatus = IIF(exists(
select *
from #Appt2 a
where
0 > DATEDIFF(day, a2.ApptDate, a.ApptDate)
and DATEDIFF(day, a2.ApptDate, a.ApptDate) > -30
and a.ApptID != a2.ApptID
and a.PatientID = a2.PatientID
and a.AppStatus = 'New'
), 'Followup', a2.AppStatus)
from #Appt2 a2
--select * from #Appt2
update a2
set a2.AppStatus = 'New'
from #Appt2 a2 join (select a.*, ROW_NUMBER() over (Partition By PatientId order by ApptId) rn from (select * from #Appt2 where AppStatus IS NULL) a) ar
on a2.ApptID = ar.ApptID
and ar.rn = 1
--select * from #Appt2
end
select * from #Appt2 order by PatientID, ApptDate
drop table #Appt1
drop table #Appt2
Update. Read the comment provided by Lukasz. It's by far smarter way. I leave my answer just as an idea.

I believe the recursive common expression is great way to optimize queries avoiding loops, but in some cases it can lead to bad performance and should be avoided if possible.
I use the code below to solve the issue and test it will more values, but encourage you to test it with your real data, too.
WITH DataSource AS
(
SELECT *
,CEILING(DATEDIFF(DAY, MIN([ApptDate]) OVER (PARTITION BY [PatientID]), [ApptDate]) * 1.0 / 30 + 0.000001) AS [GroupID]
FROM #Appt1
)
SELECT *
,IIF(ROW_NUMBER() OVER (PARTITION BY [PatientID], [GroupID] ORDER BY [ApptDate]) = 1, 'New', 'Followup')
FROM DataSource
ORDER BY [PatientID]
,[ApptDate];
The idea is pretty simple - I want separate the records in group (30 days), in which group the smallest record is new, the others are follow ups. Check how the statement is built:
SELECT *
,DATEDIFF(DAY, MIN([ApptDate]) OVER (PARTITION BY [PatientID]), [ApptDate])
,DATEDIFF(DAY, MIN([ApptDate]) OVER (PARTITION BY [PatientID]), [ApptDate]) * 1.0 / 30
,CEILING(DATEDIFF(DAY, MIN([ApptDate]) OVER (PARTITION BY [PatientID]), [ApptDate]) * 1.0 / 30 + 0.000001)
FROM #Appt1
ORDER BY [PatientID]
,[ApptDate];
So:
first, we are getting the first date, for each group and calculating the differences in days with the current one
then, we are want to get groups - * 1.0 / 30 is added
as for 30, 60, 90, etc days we are getting whole number and we wanted to start a new period, I have added + 0.000001; also, we are using ceiling function to get the smallest integer greater than, or equal to, the specified numeric expression
That's it. Having such group we simply use ROW_NUMBER to find our start date and make it as new and leaving the rest as follow ups.

With due respect to everybody and in IMHO,
There is not much difference between While LOOP and Recursive CTE in terms of RBAR
There is not much performance gain when using Recursive CTE and Window Partition function all in one.
Appid should be int identity(1,1) , or it should be ever increasing clustered index.
Apart from other benefit it also ensure that all successive row APPDate of that patient must be greater.
This way you can easily play with APPID in your query which will be more efficient than putting inequality operator like >,< in APPDate.
Putting inequality operator like >,< in APPID will aid Sql Optimizer.
Also there should be two date column in table like
APPDateTime datetime2(0) not null,
Appdate date not null
As these are most important columns in most important table,so not much cast ,convert.
So Non clustered index can be created on Appdate
Create NonClustered index ix_PID_AppDate_App on APP (patientid,APPDate) include(other column which is not i predicate except APPID)
Test my script with other sample data and lemme know for which sample data it not working.
Even if it do not work then I am sure it can be fix in my script logic itself.
CREATE TABLE #Appt1 (ApptID INT, PatientID INT, ApptDate DATE)
INSERT INTO #Appt1
SELECT 1,101,'2020-01-05' UNION ALL
SELECT 2,505,'2020-01-06' UNION ALL
SELECT 3,505,'2020-01-10' UNION ALL
SELECT 4,505,'2020-01-20' UNION ALL
SELECT 5,101,'2020-01-25' UNION ALL
SELECT 6,101,'2020-02-12' UNION ALL
SELECT 7,101,'2020-02-20' UNION ALL
SELECT 8,101,'2020-03-30' UNION ALL
SELECT 9,303,'2020-01-28' UNION ALL
SELECT 10,303,'2020-02-02'
;With CTE as
(
select a1.* ,a2.ApptDate as NewApptDate
from #Appt1 a1
outer apply(select top 1 a2.ApptID ,a2.ApptDate
from #Appt1 A2
where a1.PatientID=a2.PatientID and a1.ApptID>a2.ApptID
and DATEDIFF(day,a2.ApptDate, a1.ApptDate)>30
order by a2.ApptID desc )A2
)
,CTE1 as
(
select a1.*, a2.ApptDate as FollowApptDate
from CTE A1
outer apply(select top 1 a2.ApptID ,a2.ApptDate
from #Appt1 A2
where a1.PatientID=a2.PatientID and a1.ApptID>a2.ApptID
and DATEDIFF(day,a2.ApptDate, a1.ApptDate)<=30
order by a2.ApptID desc )A2
)
select *
,case when FollowApptDate is null then 'New'
when NewApptDate is not null and FollowApptDate is not null
and DATEDIFF(day,NewApptDate, FollowApptDate)<=30 then 'New'
else 'Followup' end
as Category
from cte1 a1
order by a1.PatientID
drop table #Appt1

Although it's not clearly addressed in the question, it's easy to figure out that the appointment dates cannot be simply categorized by 30-day groups. It makes no business sense. And you cannot use the appt id either. One can make a new appointment today for 2020-09-06.
Here is how I address this issue. First, get the first appointment, then calculate the date difference between each appointment and the first appt. If it's 0, set to 'New'. If <= 30 'Followup'. If > 30, set as 'Undecided' and do the next round check until there is no more 'Undecided'. And for that, you really need a while loop, but it does not loop through each appointment date, rather only a few datasets. I checked the execution plan. Even though there are only 10 rows, the query cost is significantly lower than that using recursive CTE, but not as low as Lukasz Szozda's addendum method.
IF OBJECT_ID('tempdb..#TEMPTABLE') IS NOT NULL DROP TABLE #TEMPTABLE
SELECT ApptID, PatientID, ApptDate
,CASE WHEN (DATEDIFF(DAY, MIN(ApptDate) OVER (PARTITION BY PatientID), ApptDate) = 0) THEN 'New'
WHEN (DATEDIFF(DAY, MIN(ApptDate) OVER (PARTITION BY PatientID), ApptDate) <= 30) THEN 'Followup'
ELSE 'Undecided' END AS Category
INTO #TEMPTABLE
FROM #Appt1
WHILE EXISTS(SELECT TOP 1 * FROM #TEMPTABLE WHERE Category = 'Undecided') BEGIN
;WITH CTE AS (
SELECT ApptID, PatientID, ApptDate
,CASE WHEN (DATEDIFF(DAY, MIN(ApptDate) OVER (PARTITION BY PatientID), ApptDate) = 0) THEN 'New'
WHEN (DATEDIFF(DAY, MIN(ApptDate) OVER (PARTITION BY PatientID), ApptDate) <= 30) THEN 'Followup'
ELSE 'Undecided' END AS Category
FROM #TEMPTABLE
WHERE Category = 'Undecided'
)
UPDATE #TEMPTABLE
SET Category = CTE.Category
FROM #TEMPTABLE t
LEFT JOIN CTE ON CTE.ApptID = t.ApptID
WHERE t.Category = 'Undecided'
END
SELECT ApptID, PatientID, ApptDate, Category
FROM #TEMPTABLE

I hope this will help you.
WITH CTE AS
(
SELECT #Appt1.*, RowNum = ROW_NUMBER() OVER (PARTITION BY PatientID ORDER BY ApptDate, ApptID) FROM #Appt1
)
SELECT A.ApptID , A.PatientID , A.ApptDate ,
Expected_Category = CASE WHEN (DATEDIFF(MONTH, B.ApptDate, A.ApptDate) > 0) THEN 'New'
WHEN (DATEDIFF(DAY, B.ApptDate, A.ApptDate) <= 30) then 'Followup'
ELSE 'New' END
FROM CTE A
LEFT OUTER JOIN CTE B on A.PatientID = B.PatientID
AND A.rownum = B.rownum + 1
ORDER BY A.PatientID, A.ApptDate

You could use a Case statement.
select
*,
CASE
WHEN DATEDIFF(d,A1.ApptDate,A2.ApptDate)>30 THEN 'New'
ELSE 'FollowUp'
END 'Category'
from
(SELECT PatientId, MIN(ApptId) 'ApptId', MIN(ApptDate) 'ApptDate' FROM #Appt1 GROUP BY PatientID) A1,
#Appt1 A2
where
A1.PatientID=A2.PatientID AND A1.ApptID<A2.ApptID
The question is, should this category be assigned based off the initial appointment, or the one prior? That is, if a Patient has had three appointments, should we compare the third appointment to the first, or the second?
You problem states the first, which is how I've answered. If that's not the case, you'll want to use lag.
Also, keep in mind that DateDiff makes not exception for weekends. If this should be weekdays only, you'll need to create your own Scalar-Valued function.

using Lag function
select apptID, PatientID , Apptdate ,
case when date_diff IS NULL THEN 'NEW'
when date_diff < 30 and (date_diff_2 IS NULL or date_diff_2 < 30) THEN 'Follow Up'
ELSE 'NEW'
END AS STATUS FROM
(
select
apptID, PatientID , Apptdate ,
DATEDIFF (day,lag(Apptdate) over (PARTITION BY PatientID order by ApptID asc),Apptdate) date_diff ,
DATEDIFF(day,lag(Apptdate,2) over (PARTITION BY PatientID order by ApptID asc),Apptdate) date_diff_2
from #Appt1
) SRC
Demo --> https://rextester.com/TNW43808

with cte
as
(
select
tmp.*,
IsNull(Lag(ApptDate) Over (partition by PatientID Order by PatientID,ApptDate),ApptDate) PriorApptDate
from #Appt1 tmp
)
select
PatientID,
ApptDate,
PriorApptDate,
DateDiff(d,PriorApptDate,ApptDate) Elapsed,
Case when DateDiff(d,PriorApptDate,ApptDate)>30
or DateDiff(d,PriorApptDate,ApptDate)=0 then 'New' else 'Followup' end Category from cte
Mine is correct. The authors was incorrect, see elapsed

Selecting 1 row per day closest to 4am? [duplicate]

This question already has answers here:
Get top 1 row of each group
(19 answers)
Closed 6 years ago.
We're currently working on a query for a report that returns a series of data. The customer has specified that they want to receive 5 rows total, with the data from the previous 5 days (as defined by a start date and an end date variable). For each day, they want the data from the row that's closest to 4am.
I managed to get it to work for a single day, but I certainly don't want to union 5 separate select statements simply to fetch these values. Is there any way to accomplish this via CTEs?
select top 1
'W' as [RecordType]
, [WellIdentifier] as [ProductionPtID]
, t.Name as [Device Name]
, t.RecordDate --convert(varchar, t.RecordDate, 112) as [RecordDate]
, TubingPressure as [Tubing Pressure]
, CasingPressure as [Casing Pressure]
from #tTempData t
Where cast (t.recorddate as time) = '04:00:00.000'
or datediff (hh,'04:00:00.000',cast (t.recorddate as time)) < -1.2
order by Name, RecordDate desc

assuming that the #tTempData only contains the previous 5 days records
SELECT *
FROM
(
SELECT *, rn = row_number() over
(
partition by convert(date, recorddate)
order by ABS ( datediff(minute, convert(time, recorddate) , '04:00' )
)
FROM #tTempData
)
WHERE rn = 1

You can use row_number() like this to get the top 5 last days most closest to 04:00
SELECT TOP 5 * FROM (
select t.* ,
ROW_NUMBER() OVER(PARTITION BY t.recorddate
ORDER BY abs(datediff (minute,'04:00:00.000',cast (t.recorddate as time))) rnk
from #tTempData t)
WHERE rnk = 1
ORDER BY recorddate DESC

You can use row_number() for this purpose:
select t.*
from (select t.*,
row_number() over (partition by cast(t.recorddate as date)
order by abs(datediff(ms, '04:00:00.000',
cast(t.recorddate as time)
))
) seqnum
from #tTempData t
) t
where seqnum = 1;
You can add an appropriate where clause in the subquery to get the dates that you are interested in.

Try something like this:
select
'W' as [RecordType]
, [WellIdentifier] as [ProductionPtID]
, t.Name as [Device Name]
, t.RecordDate --convert(varchar, t.RecordDate, 112) as [RecordDate]
, TubingPressure as [Tubing Pressure]
, CasingPressure as [Casing Pressure]
from #tTempData t
Where exists
(select 1 from #tTempData t1 where
ABS(datediff (hh,'04:00:00.000',cast (t.recorddate as time))) <
ABS(datediff (hh,'04:00:00.000',cast (t1.recorddate as time)))
and GETDATE(t.RecordDate) = GETDATE(t1.RecordDate)
)dt
and t.RecordDate between YOURDATERANGE
order by Name, RecordDate desc;

SQL - Replace repeated rows with null values while preserving number of rows

I am trying to get only one instance of a year instead of 12 because I am using this column in a lookup table to provide parameters to a report. Because I am using both monthly and yearly data, I am trying to get them both in the same table.
I have a table like this
--Date--------Year
--------------------
1/2012-------2012
2/2012-------2012
3/2012-------2012
4/2012-------2012
5/2012-------2012
6/2012-------2012
7/2012-------2012
8/2012-------2012
9/2012-------2012
10/2012------2012
11/2012------2012
12/2012------2012
1/2013-------2013
2/2013-------2013
And this is my desired table
--Date--------Year
--------------------
1/2012-------2012
2/2012-------null
3/2012-------null
4/2012-------null
5/2012-------null
6/2012-------null
7/2012-------null
8/2012-------null
9/2012-------null
10/2012------null
11/2012------null
12/2012------null
1/2013-------2013
2/2013-------null
Can someone give me an idea of how to solve a problem like this?
The code I am using right now is
SELECT CAST(MONTH(rmp.EcoDate) AS Varchar(2)) + '/' + CAST(YEAR(rmp.EcoDate) AS varchar(4)) AS Date, Year(rmp.EcoDate) as EcoYear
FROM PhdRpt.ReportCaseList_542 AS rcl INNER JOIN
CaseCases AS cc ON rcl.CaseCaseId = cc.CaseCaseId INNER JOIN
PhdRpt.RptMonthlyProduction_542 AS rmp ON rcl.ReportRunCaseId = rmp.ReportRunCaseId`
GROUP BY rmp.EcoDate

You can do this by enumerating the rows within a year. Then update all but the first:
with toupdate as (
select t.*, row_number() over (partition by [year] order by [date]) as seqnum
from t
)
update toupdate
set [year] = NULL
where seqnum > 1;
If you want this as a select statement:
with ts as (
select t.*, row_number() over (partition by [year] order by [date]) as seqnum
from t
)
select [date],
(case when seqnum = 1 then [year] end) as [year]
from ts;

Calculate Average time spend based on a change in location zone

I have a table similar to
create table LOCHIST
(
RES_ID VARCHAR(10) NOT NULL,
LOC_DATE TIMESTAMP NOT NULL,
LOC_ZONE VARCHAR(10)
)
with values such as
insert into LOCHIST values(0911,2015-09-23 12:27:00.000000,SYLVSYLGA);
insert into LOCHIST values(5468,2013-02-15 13:13:24.000000,30726);
insert into LOCHIST values(23894,2013-02-15 13:12:13.000000,BECTFOUNC);
insert into LOCHIST values(24119,2013-02-15 13:12:09.000000,30363);
insert into LOCHIST values(7101,2013-02-15 13:11:37.000000,37711);
insert into LOCHIST values(26083,2013-02-15 13:11:36.000000,SHAWANDAL);
insert into LOCHIST values(24978,2013-02-15 13:11:36.000000,38132);
insert into LOCHIST values(26696,2013-02-15 13:11:27.000000,29583);
insert into LOCHIST values(5468,2013-02-15 13:11:00.000000,37760);
insert into LOCHIST values(5552,2013-02-15 13:10:55.000000,30090);
insert into LOCHIST values(24932,2013-02-15 13:10:48.000000,JBTTLITGA);
insert into LOCHIST values(23894,2013-02-15 13:10:42.000000,47263);
insert into LOCHIST values(26803,2013-02-15 13:10:25.000000,32534);
insert into LOCHIST values(24434,2013-02-15 13:10:03.000000,PLANSUFVA);
insert into LOCHIST values(26696,2013-02-15 13:10:00.000000,GEORALBGA);
insert into LOCHIST values(5468,2013-02-15 13:09:54.000000,19507);
insert into LOCHIST values(23894,2013-02-15 13:09:48.000000,37725);
This table literally goes on for millions of records.
Each RES_ID represents the ID of a trailer who pings their location to a LOC_ZONE which is then stored at the time in LOC_DATE.
What I am trying to find, is the average amount of time spent for all trailers in a specific location zone. For example, if trailer x spent 4 hours in in loc zone PLANSUFVA, and trailer y spent 6 hours in loc zone PLANSUFVA I would want to return
Loc Zone Avg Time
PLANSUFVA 5
Is there anyway to do this without cursors?
I really appreciate your help.

This needs SQL 2012:
with data
as (
select *, (case when LOC_ZONE != PREVIOUS_LOC_ZONE or PREVIOUS_LOC_ZONE is null then ROW_ID else null end) as STAY_START, (case when LOC_ZONE != NEXT_LOC_ZONE or NEXT_LOC_ZONE is null then ROW_ID else null end) as STAY_END
from (
select RES_ID, LOC_ZONE, LOC_DATE, lead(LOC_DATE, 1) over (partition by RES_ID, LOC_ZONE order by LOC_DATE) as NEXT_LOC_DATE, lag(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as PREVIOUS_LOC_ZONE, lead(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as NEXT_LOC_ZONE, ROW_NUMBER() over (order by RES_ID, LOC_ZONE, LOC_DATE) as ROW_ID
from LOCHIST
) t
), stays as (
select * from (
select RES_ID, LOC_ZONE, STAY_START, lead(STAY_END, 1) over (order by ROWID) as STAY_END
from (
select RES_ID, LOC_ZONE, STAY_START, STAY_END, ROW_NUMBER() over (order by RES_ID, LOC_ZONE, STAY_START desc) as ROWID
from data
where STAY_START is not null or STAY_END is not null
) t
) t
where STAY_START is not null and STAY_END is not null
)
select s.LOC_ZONE, avg(datediff(second, LOC_DATE, NEXT_LOC_DATE)) / 60 / 60 as AVG_IN_HOURS
from data d
inner join stays s on d.RES_ID = s.RES_ID and d.LOC_ZONE = s.LOC_ZONE and d.ROW_ID >= s.STAY_START and d.ROW_ID < s.STAY_END
group by s.LOC_ZONE

To solve this problem, you need the amount of time spent at each location.
One way to do this is with a correlated subquery. You need to group adjacent values. The idea is to find the next value in the sequence:
select resid, min(loc_zone) as loc_zone, min(loc_date) as StartTime,
max(loc_date) as EndTime,
nextdate as NextStartTime
from (select lh.*,
(select min(loc_date) from lochist lh2
where lh2.res_id = lh.res_id and lh2.loc_zone <> lh.loc_zone and
lh2.loc_date > lh.loc_date
) as nextdate
from lochist lh
) lh
group by lh.res_id, nextdate
With this data, you can then get the average that you want.
I am not clear if the time should be based on EndTime - StartTime (last recorded time at the location minus the first recorded time) or NextStartTime - startTime (first time at next location minus first time at this location).
Also, this returns NULL for the last location for each res_id. You don't say what to do about the last in the sequence.
If you build an index on res_id, loc_date, loc_zone, it might run faster.
If you had Oracle or SQL Server 2012, the right query is:
select lh.*,
lead(loc_date) over (partition by res_id order by loc_date) as nextdate
from (select lh.*,
lag(loc_zone) over (partition by res_id order by loc_date) as prevzone
from lochist lh
) lh
where prevzone is null or prevzone <> loc_zone
Now you have one row per stay and nextdate is the date at the next zone.

This should get you each zone ordered by the average number of minutes spent in it. The CROSS APPLY returns the next ping in a different zone.
SELECT
loc.LOC_ZONE
,AVG(DATEDIFF(mi,loc.LOC_DATE,nextPing.LOC_DATE)) AS avgMinutes
FROM LOCHIST loc
CROSS APPLY(
SELECT TOP 1 loc2.LOC_DATE
FROM LOCHIST loc2
WHERE loc2.RES_ID = loc.RES_ID
AND loc2.LOC_DATE > loc.LOC_DATE
AND loc2.LOC_ZONE <> loc.LOC_ZONE
ORDER BY loc2.LOC_DATE ASC
) AS nextPing
GROUP BY loc.LOC_ZONE
ORDER BY avgMinutes DESC

My variation of the solution:
select LOC_ZONE, avg(TOTAL_TIME) AVG_TIME from (
select RES_ID, LOC_ZONE, sum(TIME_SPENT) TOTAL_TIME
from (
select RES_ID, LOC_ZONE, datediff(mi, lag(LOC_DATE, 1) over (
partition by RES_ID order by LOC_DATE), LOC_DATE) TIME_SPENT
from LOCHIST
) t
where TIME_SPENT is not null
group by RES_ID, LOC_ZONE) f
group by LOC_ZONE
This accounts for multiple stays at the same location. The choice between lag or lead depends if a stay should start or end with the ping (ie, if one trailer sends a ping from A and then x hours later from B, does that count for A or B).

To do this without using either a cursor or a correlated subquery, try:
with rl as
(select l.*, rank() over (partition by res_id order by loc_date) rn
from lochist l),
fdr as
(select rc.*, coalesce(rn.loc_date, getdate()) next_date
from rl rc
left join rl rn on rc.res_id = rn.res_id and rc.rn + 1 = rn.rn)
select loc_zone, avg(datediff(second, loc_date, next_date))/3600 avg_time
from fdr
group by loc_zone
SQLFiddle here.
(Because of the way that SQLServer calculates time differences, it's probably better to calculate the average time in seconds and then divide by 60*60. With the exception of the getdate() and datediff clauses - which can be replaced by sysdate and next_date - loc_date - this should work in both SQLServer 2005 onwards and Oracle 10g onwards.)

SQL Question: Getting Datediff in days elapsed for each record in a group

Given this table:
How can I get the datediff in days between each status_date for each group of ID_Number? In other words I need to find the number of elapsed days for each status that the ID_Number has been given.
Some things to know:
All ID_Number will have a received_date which should be the earliest date for each ID_Number (but app doesn't enforce)
For each ID_Number there will be a status with a corresponding status_date which is the date that the ID_Number was given that particular status.
The status column doesn't always necessarily go in the same order every time (app doesn't enforce)
All ID_Number will have a closed_date which should be the latest date (but app doesn't enforce)
Sample output:
So for ID_Number 2001, the first date (received_date) is 2009-05-02 and the next date you encounter has a status of 'open' and is 2009-05-02 so elapsed days is 0. Moving on to the next date encountered is 2009-05-10 with a status of 'invest' and the elapsed days is 8 counting from the prior date. The next date encountered is 2009-07-11 and the elapsed days is 62 counting from the previous date.
Edited to add:
Is it possible to have the elapsed days end up as a column on this table/view?
I also forgot to add that this is SQL Server 2000.

What I understand is that you need the difference between the first status_date and the next status_date for the same id and so on up to the closed_date.
This will only work in SQL 2005 and up.
;with test as (
select
key,
id_number,
status,
received_date,
status_date,
closed_date,
row_number() over (partition by id order by status_date, key ) as rownum
from #test
)
select
t1.key,
t1.id_number,
t1.status,
t1.status_date,
t1.received_date,
t1.closed_date,
datediff(d, case when t1.rownum = 1
then t1.received_date
else
case when t2.status_date is null
then t1.closed_date
else t2.status_date
end
end,
t1.status_date
) as days
from test t1
left outer join test t2
on t1.id = t2.id
and t2.rownum = t1.rownum - 1
This solution will work with SQL 2000 but I am not sure how good will perform:
select *,
datediff(d,
case when prev_date is null
then closed_date
else prev_date
end,
status_date )
from (
select *,
isnull( ( select top 1 t2.status_date
from #test t2
where t1.id_number = t2.id_number
and t2.status_date < t1.status_date
order by t2.status_date desc
),received_date) as prev_date
from #test t1
) a
order by id_number, status_date
Note: Replace the #Test table with the name of your table.

Some sample output would really help, but this is a guess at what you mean, assuming you want that information for each ID_Number/Status combination:
select ID_Number, Status, EndDate - StartDate as DaysElapsed
from (
select ID_Number, Status, min(coalesce(received_date, status_date)) as StartDate, max(coalesce(closed_date, status_date)) as EndDate
from Table1
group by ID_Number, Status
) a

The tricky bit is determining the previous status and putting it on the same row as the current status. It would be simplified a little if there were a correlation between Key and StatusDate (i.e. that Key(x) > Key(y) always implies StatusDate(x) >= StatusDate(y)). Unfortunately, that doesn't seem to be the case.
PS: I am assuming Key is a unique identifier on your table; you haven't said anything to indicate otherwise.
SELECT Key,
ID_Number,
(
SELECT TOP 1 Key
FROM StatusUpdates prev
WHERE (prev.ID_Number = cur.ID_Number)
AND ( (prev.StatusDate < cur.StatusDate)
OR ( prev.StatusDate = cur.StatusDate
AND prev.Key < cur.Key
)
)
ORDER BY StatusDate, Key /*Consider index on (ID_Number, StatusDate, Key)*/
) PrevKey
FROM StatusUpdates cur
Once you have this as a basis, it's easy to extrapolate to any other info you need from the current or previous StatusUpdate. E.g.
SELECT c.*,
p.Status AS PrevStatus,
p.StatusDate AS PrevStatusDate,
DATEDIFF(d, c.StatusDate, p.StatusDate) AS DaysElapsed
FROM (
SELECT Key,
ID_Number,
Status,
SattusDate,
(
SELECT TOP 1 Key
FROM StatusUpdates prev
WHERE (prev.ID_Number = cur.ID_Number)
AND ( (prev.StatusDate < cur.StatusDate)
OR ( prev.StatusDate = cur.StatusDate
AND prev.Key < cur.Key
)
)
ORDER BY StatusDate, Key
) PrevKey
FROM StatusUpdates cur
) c
JOIN StatusUpdates p ON
p.Key = c.PrevKey

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL update with a LAG - sql

I think the simplest way to update the whole table at once would be something like this with rn_cte as ( select * , row_number() over (partition by patient order by (select null)) rn from tTable) update tTable set start=NULl where rn>=2;

Related

Query without WHILE Loop

Selecting 1 row per day closest to 4am? [duplicate]

SQL - Replace repeated rows with null values while preserving number of rows

Calculate Average time spend based on a change in location zone

SQL Question: Getting Datediff in days elapsed for each record in a group

Categories

Resources