Calculate Average time spend based on a change in location zone - sql

I have a table similar to
create table LOCHIST
(
RES_ID VARCHAR(10) NOT NULL,
LOC_DATE TIMESTAMP NOT NULL,
LOC_ZONE VARCHAR(10)
)
with values such as
insert into LOCHIST values(0911,2015-09-23 12:27:00.000000,SYLVSYLGA);
insert into LOCHIST values(5468,2013-02-15 13:13:24.000000,30726);
insert into LOCHIST values(23894,2013-02-15 13:12:13.000000,BECTFOUNC);
insert into LOCHIST values(24119,2013-02-15 13:12:09.000000,30363);
insert into LOCHIST values(7101,2013-02-15 13:11:37.000000,37711);
insert into LOCHIST values(26083,2013-02-15 13:11:36.000000,SHAWANDAL);
insert into LOCHIST values(24978,2013-02-15 13:11:36.000000,38132);
insert into LOCHIST values(26696,2013-02-15 13:11:27.000000,29583);
insert into LOCHIST values(5468,2013-02-15 13:11:00.000000,37760);
insert into LOCHIST values(5552,2013-02-15 13:10:55.000000,30090);
insert into LOCHIST values(24932,2013-02-15 13:10:48.000000,JBTTLITGA);
insert into LOCHIST values(23894,2013-02-15 13:10:42.000000,47263);
insert into LOCHIST values(26803,2013-02-15 13:10:25.000000,32534);
insert into LOCHIST values(24434,2013-02-15 13:10:03.000000,PLANSUFVA);
insert into LOCHIST values(26696,2013-02-15 13:10:00.000000,GEORALBGA);
insert into LOCHIST values(5468,2013-02-15 13:09:54.000000,19507);
insert into LOCHIST values(23894,2013-02-15 13:09:48.000000,37725);
This table literally goes on for millions of records.
Each RES_ID represents the ID of a trailer who pings their location to a LOC_ZONE which is then stored at the time in LOC_DATE.
What I am trying to find, is the average amount of time spent for all trailers in a specific location zone. For example, if trailer x spent 4 hours in in loc zone PLANSUFVA, and trailer y spent 6 hours in loc zone PLANSUFVA I would want to return
Loc Zone Avg Time
PLANSUFVA 5
Is there anyway to do this without cursors?
I really appreciate your help.

This needs SQL 2012:
with data
as (
select *, (case when LOC_ZONE != PREVIOUS_LOC_ZONE or PREVIOUS_LOC_ZONE is null then ROW_ID else null end) as STAY_START, (case when LOC_ZONE != NEXT_LOC_ZONE or NEXT_LOC_ZONE is null then ROW_ID else null end) as STAY_END
from (
select RES_ID, LOC_ZONE, LOC_DATE, lead(LOC_DATE, 1) over (partition by RES_ID, LOC_ZONE order by LOC_DATE) as NEXT_LOC_DATE, lag(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as PREVIOUS_LOC_ZONE, lead(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as NEXT_LOC_ZONE, ROW_NUMBER() over (order by RES_ID, LOC_ZONE, LOC_DATE) as ROW_ID
from LOCHIST
) t
), stays as (
select * from (
select RES_ID, LOC_ZONE, STAY_START, lead(STAY_END, 1) over (order by ROWID) as STAY_END
from (
select RES_ID, LOC_ZONE, STAY_START, STAY_END, ROW_NUMBER() over (order by RES_ID, LOC_ZONE, STAY_START desc) as ROWID
from data
where STAY_START is not null or STAY_END is not null
) t
) t
where STAY_START is not null and STAY_END is not null
)
select s.LOC_ZONE, avg(datediff(second, LOC_DATE, NEXT_LOC_DATE)) / 60 / 60 as AVG_IN_HOURS
from data d
inner join stays s on d.RES_ID = s.RES_ID and d.LOC_ZONE = s.LOC_ZONE and d.ROW_ID >= s.STAY_START and d.ROW_ID < s.STAY_END
group by s.LOC_ZONE

To solve this problem, you need the amount of time spent at each location.
One way to do this is with a correlated subquery. You need to group adjacent values. The idea is to find the next value in the sequence:
select resid, min(loc_zone) as loc_zone, min(loc_date) as StartTime,
max(loc_date) as EndTime,
nextdate as NextStartTime
from (select lh.*,
(select min(loc_date) from lochist lh2
where lh2.res_id = lh.res_id and lh2.loc_zone <> lh.loc_zone and
lh2.loc_date > lh.loc_date
) as nextdate
from lochist lh
) lh
group by lh.res_id, nextdate
With this data, you can then get the average that you want.
I am not clear if the time should be based on EndTime - StartTime (last recorded time at the location minus the first recorded time) or NextStartTime - startTime (first time at next location minus first time at this location).
Also, this returns NULL for the last location for each res_id. You don't say what to do about the last in the sequence.
If you build an index on res_id, loc_date, loc_zone, it might run faster.
If you had Oracle or SQL Server 2012, the right query is:
select lh.*,
lead(loc_date) over (partition by res_id order by loc_date) as nextdate
from (select lh.*,
lag(loc_zone) over (partition by res_id order by loc_date) as prevzone
from lochist lh
) lh
where prevzone is null or prevzone <> loc_zone
Now you have one row per stay and nextdate is the date at the next zone.

This should get you each zone ordered by the average number of minutes spent in it. The CROSS APPLY returns the next ping in a different zone.
SELECT
loc.LOC_ZONE
,AVG(DATEDIFF(mi,loc.LOC_DATE,nextPing.LOC_DATE)) AS avgMinutes
FROM LOCHIST loc
CROSS APPLY(
SELECT TOP 1 loc2.LOC_DATE
FROM LOCHIST loc2
WHERE loc2.RES_ID = loc.RES_ID
AND loc2.LOC_DATE > loc.LOC_DATE
AND loc2.LOC_ZONE <> loc.LOC_ZONE
ORDER BY loc2.LOC_DATE ASC
) AS nextPing
GROUP BY loc.LOC_ZONE
ORDER BY avgMinutes DESC

My variation of the solution:
select LOC_ZONE, avg(TOTAL_TIME) AVG_TIME from (
select RES_ID, LOC_ZONE, sum(TIME_SPENT) TOTAL_TIME
from (
select RES_ID, LOC_ZONE, datediff(mi, lag(LOC_DATE, 1) over (
partition by RES_ID order by LOC_DATE), LOC_DATE) TIME_SPENT
from LOCHIST
) t
where TIME_SPENT is not null
group by RES_ID, LOC_ZONE) f
group by LOC_ZONE
This accounts for multiple stays at the same location. The choice between lag or lead depends if a stay should start or end with the ping (ie, if one trailer sends a ping from A and then x hours later from B, does that count for A or B).

To do this without using either a cursor or a correlated subquery, try:
with rl as
(select l.*, rank() over (partition by res_id order by loc_date) rn
from lochist l),
fdr as
(select rc.*, coalesce(rn.loc_date, getdate()) next_date
from rl rc
left join rl rn on rc.res_id = rn.res_id and rc.rn + 1 = rn.rn)
select loc_zone, avg(datediff(second, loc_date, next_date))/3600 avg_time
from fdr
group by loc_zone
SQLFiddle here.
(Because of the way that SQLServer calculates time differences, it's probably better to calculate the average time in seconds and then divide by 60*60. With the exception of the getdate() and datediff clauses - which can be replaced by sysdate and next_date - loc_date - this should work in both SQLServer 2005 onwards and Oracle 10g onwards.)

Related

Collapse multiple rows into a single row based upon a break condition

I have a simple sounding requirement that has had me stumped for a day or so now, so its time to seek help from the experts.
My requirement is to simply roll-up multiple rows into a single row based upon a break condition - when any of these columns change Employee ID, Allowance Plan, Allowance Amount or To Date, then the row is to be kept, if that makes sense.
An example source data set is shown below:
and the target data after collapsing the rows should look like this:
As you can see I don't need any type of running totals calculating I just need to collapse the rows into a single record per from date/to date combination.
So far I have tried the following SQL using a GROUP BY and MIN function
select [Employee ID], [Allowance Plan],
min([From Date]), max([To Date]), [Allowance Amount]
from [dbo].[#AllowInfo]
group by [Employee ID], [Allowance Plan], [Allowance Amount]
but that just gives me a single row and does not take into account the break condition.
what do I need to do so that the records are rolled-up (correct me if that is not the right terminology) correctly taking into account the break condition?
Any help is appreciated.
Thank you.
Note that your test data does not really exercise the algo that well - e.g. you only have one employee, one plan. Also, as you described it, you would end up with 4 rows as there is a change of todate between 7->8, 8->9, 9->10 and 10->11.
But I can see what you are trying to do, so this should at least get you on the right track, and returns the expected 3 rows. I have taken the end of a group to be where either employee/plan/amount has changed, or where todate is not null (or where we reach the end of the data)
CREATE TABLE #data
(
RowID INT,
EmployeeID INT,
AllowancePlan VARCHAR(30),
FromDate DATE,
ToDate DATE,
AllowanceAmount DECIMAL(12,2)
);
INSERT INTO #data(RowID, EmployeeID, AllowancePlan, FromDate, ToDate, AllowanceAmount)
VALUES
(1,200690,'CarAllowance','30/03/2017', NULL, 1000.0),
(2,200690,'CarAllowance','01/08/2017', NULL, 1000.0),
(6,200690,'CarAllowance','23/04/2018', NULL, 1000.0),
(7,200690,'CarAllowance','30/03/2018', NULL, 1000.0),
(8,200690,'CarAllowance','21/06/2018', '01/04/2019', 1000.0),
(9,200690,'CarAllowance','04/11/2021', NULL, 1000.0),
(10,200690,'CarAllowance','30/03/2017', '13/05/2022', 1000.0),
(11,200690,'CarAllowance','14/05/2022', NULL, 850.0);
-- find where the break points are
WITH chg AS
(
SELECT *,
CASE WHEN LAG(EmployeeID, 1, -1) OVER(ORDER BY RowID) != EmployeeID
OR LAG(AllowancePlan, 1, 'X') OVER(ORDER BY RowID) != AllowancePlan
OR LAG(AllowanceAmount, 1, -1) OVER(ORDER BY RowID) != AllowanceAmount
OR LAG(ToDate, 1) OVER(ORDER BY RowID) IS NOT NULL
THEN 1 ELSE 0 END AS NewGroup
FROM #data
),
-- count the number of break points as we go to group the related rows
grp AS
(
SELECT chg.*,
ISNULL(
SUM(NewGroup)
OVER (ORDER BY RowID
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
0) AS grpNum
FROM chg
)
SELECT MIN(grp.RowID) AS RowID,
MAX(grp.EmployeeID) AS EmployeeID,
MAX(grp.AllowancePlan) AS AllowancePlan,
MIN(grp.FromDate) AS FromDate,
MAX(grp.ToDate) AS ToDate,
MAX(grp.AllowanceAmount) AS AllowanceAmount
FROM grp
GROUP BY grpNum
one way is to get all rows the last todate, and then group on that
select min(t.RowID) as RowID,
t.EmployeeID,
min(t.AllowancePlan) as AllowancePlan,
min(t.FromDate) as FromDate,
max(t.ToDate) as ToDate,
min(t.AllowanceAmount) as AllowanceAmount
from ( select t.RowID,
t.EmployeeID,
t.FromDate,
t.AllowancePlan,
t.AllowanceAmount,
case when t.ToDate is null then ( select top 1 t2.ToDate
from test t2
where t2.EmployeeID = t.EmployeeID
and t2.ToDate is not null
and t2.FromDate > t.FromDate -- t2.RowID > t.RowID
order by t2.RowID, t2.FromDate
)
else t.ToDate
end as todate
from test t
) t
group by t.EmployeeID, t.ToDate
order by t.EmployeeID, min(t.RowID)
See and test yourself in this DBFiddle
the result is
RowID
EmployeeID
AllowancePlan
FromDate
ToDate
AllowanceAmount
1
200690
CarAllowance
2017-03-30
2019-04-01
1000
9
200690
CarAllowance
2021-11-04
2022-05-13
1000
11
200690
CarAllowance
2022-05-14
(null)
850

SQL update with a LAG

I have the following update that updates a record based on a prior record -
WITH CTE AS(
SELECT
patient,
start,
CASE
WHEN
ISNULL(start, '') = ''
AND cd = '3'
AND LAG([thru_dt]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) BETWEEN CONVERT(VARCHAR, DATEADD(DAY, -30, thru_dt), 112) AND thru_dt
AND LAG([cd]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) = '30'
THEN LAG([start]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC)
WHEN ISNULL(hhstrtdt, '') = '' AND
ROW_NUMBER() OVER(PARTITION BY patient ORDER BY cast(claimno AS int) DESC) = 1
AND LAG([cd]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) = '30'
AND LAG([thru_dt]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC) BETWEEN CONVERT(VARCHAR, DATEADD(DAY, -30, thru_dt), 112) AND thru_dt
THEN LAG([start]) OVER (PARTITION BY [patient] ORDER BY cast(claimno AS int) ASC)
ELSE start
END AS new_start
FROM table
)
UPDATE CTE
SET start = new_start
This query only updates one record at a time. For example, if I had this input -
start patient cd
20190307 497863693 30
NULL 497863693 30
NULL 497863693 30
NULL 497863693 30
To update, all 3 rows with a NULL start with the value of the first row, I would have to run the query 3 times.
The output would be -
start patient cd
20190307 497863693 30
20190307 497863693 30
20190307 497863693 30
20190307 497863693 30
Is there a way to get the query to update all rows for the above patient instead of it doing it one by one? I could keep a count of the NULLs in the table and make the update stop running once the no.of NULLs in the table stopped decreasing but that does not seem like a good way of doing it.
You can use a cumulative max for this. If I understand correctly, you want to reset NULL values on rows with cd = 30 to latest start value when cd = '30':
with toupdate as (
select t.*,
max(start) over (partition by patientid order by cast(claimno AS int)) as imputed_start
from t
where cd = '30'
)
update toupdate
set start = imputed_start
where start is null or imputed_start <> start;
I think the simplest way to update the whole table at once would be something like this
with rn_cte as (
select * , row_number() over (partition by patient order by (select null)) rn
from tTable)
update tTable
set start=NULl
where rn>=2;

How to get the validity date range of a price from individual daily prices in SQL

I have some prices for the month of January.
Date,Price
1,100
2,100
3,115
4,120
5,120
6,100
7,100
8,120
9,120
10,120
Now, the o/p I need is a non-overlapping date range for each price.
price,from,To
100,1,2
115,3,3
120,4,5
100,6,7
120,8,10
I need to do this using SQL only.
For now, if I simply group by and take min and max dates, I get the below, which is an overlapping range:
price,from,to
100,1,7
115,3,3
120,4,10
This is a gaps-and-islands problem. The simplest solution is the difference of row numbers:
select price, min(date), max(date)
from (select t.*,
row_number() over (order by date) as seqnum,
row_number() over (partition by price, order by date) as seqnum2
from t
) t
group by price, (seqnum - seqnum2)
order by min(date);
Why this works is a little hard to explain. But if you look at the results of the subquery, you will see how the adjacent rows are identified by the difference in the two values.
SELECT Lag.price,Lag.[date] AS [From], MIN(Lead.[date]-Lag.[date])+Lag.[date] AS [to]
FROM
(
SELECT [date],[Price]
FROM
(
SELECT [date],[Price],LAG(Price) OVER (ORDER BY DATE,Price) AS LagID FROM #table1 A
)B
WHERE CASE WHEN Price <> ISNULL(LagID,1) THEN 1 ELSE 0 END = 1
)Lag
JOIN
(
SELECT [date],[Price]
FROM
(
SELECT [date],Price,LEAD(Price) OVER (ORDER BY DATE,Price) AS LeadID FROM [#table1] A
)B
WHERE CASE WHEN Price <> ISNULL(LeadID,1) THEN 1 ELSE 0 END = 1
)Lead
ON Lag.[Price] = Lead.[Price]
WHERE Lead.[date]-Lag.[date] >= 0
GROUP BY Lag.[date],Lag.[price]
ORDER BY Lag.[date]
Another method using ROWS UNBOUNDED PRECEDING
SELECT price, MIN([date]) AS [from], [end_date] AS [To]
FROM
(
SELECT *, MIN([abc]) OVER (ORDER BY DATE DESC ROWS UNBOUNDED PRECEDING ) end_date
FROM
(
SELECT *, CASE WHEN price = next_price THEN NULL ELSE DATE END AS abc
FROM
(
SELECT a.* , b.[date] AS next_date, b.price AS next_price
FROM #table1 a
LEFT JOIN #table1 b
ON a.[date] = b.[date]-1
)AA
)BB
)CC
GROUP BY price, end_date

T-SQL query to obtain the no of days an item was at the current price

Declare #sec_temp table
(
sec_no varchar(10),
amount money,
price_date date
)
insert #sec_temp
values
('123ABC', 25, '2011-01-20'),
('123ABC', 25, '2011-01-19'),
('123ABC', 25, '2011-01-18'),
('123ABC', 20, '2011-01-15'),
('123ABC', 22, '2011-01-13'),
('456DEF', 22, '2011-01-13')
Problem: To list out the distinct sec_no with the latest price (amount) and the number of days it was at the current price. In this case,
Result:
sec_no amount no_of_days_at_price
123ABC 25 3 e.g. 01-18 to 01-20
456DEF 22 1 e.g. 01-13
select
a.sec_no,
a.amount,
min(price_date) as FirstDateAtPrice,
No_of_days_at_price = COALESCE(DATEDIFF(d, c.price_date, a.price_date),0)
from (
select *, ROW_NUMBER() over (partition by sec_no order by price_date desc) rn
from #sec_temp) a
outer apply (
select top 1 *
from #sec_temp b
where a.sec_no=b.sec_no and a.amount <> b.amount
order by b.price_date desc
) c
where a.rn=1
The subquery A works out the greatest-1-per-group, which is to say the most recent price record for each sec_no. The subquery C finds the first prior record that holds a different price for the same sec_no. The difference in the two dates is the number of days sought. If you need it to be one for no prior date, change the end of the COALESCE line to 1 instead of 0.
EDITED for clarified question
To start counting from the first date equal to the current rate, use this query instead
select
sec_no,
amount,
No_of_days_at_price = 1 + DATEDIFF(d, min(price_date), max(price_date))
from (
select *,
ROW_NUMBER() over (partition by sec_no order by price_date desc) rn,
ROW_NUMBER() over (partition by sec_no, amount order by price_date desc) rn2
from #sec_temp
) X
WHERE rn=rn2
group by sec_no, amount
AND FINALLY If the required result is actually the days between
the first date on which the price is equal to current; and
today
Then the only part to change is this:
No_of_days_at_price = 1 + DATEDIFF(d, min(price_date), getdate())
Here's one approach, first looking up the latest price, and then the last price that was different:
select secs.sec_no
, latest.amount as price
, case when previous.price_date is null then 1
else datediff(day, previous.price_date, latest.price_date)
end as days_at_price
from (
select distinct sec_no
from #sec_temp
) secs
cross apply
(
select top 1 amount
, price_date
from #sec_temp
where sec_no = secs.sec_no
order by
price_date desc
) latest
outer apply
(
select top 1 price_date
from #sec_temp
where sec_no = secs.sec_no
and amount <> latest.amount
order by
price_date desc
) previous
This prints:
sec_no price days_at_price
123ABC 25,00 5
456DEF 22,00 1

SQL Question: Getting Datediff in days elapsed for each record in a group

Given this table:
How can I get the datediff in days between each status_date for each group of ID_Number? In other words I need to find the number of elapsed days for each status that the ID_Number has been given.
Some things to know:
All ID_Number will have a received_date which should be the earliest date for each ID_Number (but app doesn't enforce)
For each ID_Number there will be a status with a corresponding status_date which is the date that the ID_Number was given that particular status.
The status column doesn't always necessarily go in the same order every time (app doesn't enforce)
All ID_Number will have a closed_date which should be the latest date (but app doesn't enforce)
Sample output:
So for ID_Number 2001, the first date (received_date) is 2009-05-02 and the next date you encounter has a status of 'open' and is 2009-05-02 so elapsed days is 0. Moving on to the next date encountered is 2009-05-10 with a status of 'invest' and the elapsed days is 8 counting from the prior date. The next date encountered is 2009-07-11 and the elapsed days is 62 counting from the previous date.
Edited to add:
Is it possible to have the elapsed days end up as a column on this table/view?
I also forgot to add that this is SQL Server 2000.
What I understand is that you need the difference between the first status_date and the next status_date for the same id and so on up to the closed_date.
This will only work in SQL 2005 and up.
;with test as (
select
key,
id_number,
status,
received_date,
status_date,
closed_date,
row_number() over (partition by id order by status_date, key ) as rownum
from #test
)
select
t1.key,
t1.id_number,
t1.status,
t1.status_date,
t1.received_date,
t1.closed_date,
datediff(d, case when t1.rownum = 1
then t1.received_date
else
case when t2.status_date is null
then t1.closed_date
else t2.status_date
end
end,
t1.status_date
) as days
from test t1
left outer join test t2
on t1.id = t2.id
and t2.rownum = t1.rownum - 1
This solution will work with SQL 2000 but I am not sure how good will perform:
select *,
datediff(d,
case when prev_date is null
then closed_date
else prev_date
end,
status_date )
from (
select *,
isnull( ( select top 1 t2.status_date
from #test t2
where t1.id_number = t2.id_number
and t2.status_date < t1.status_date
order by t2.status_date desc
),received_date) as prev_date
from #test t1
) a
order by id_number, status_date
Note: Replace the #Test table with the name of your table.
Some sample output would really help, but this is a guess at what you mean, assuming you want that information for each ID_Number/Status combination:
select ID_Number, Status, EndDate - StartDate as DaysElapsed
from (
select ID_Number, Status, min(coalesce(received_date, status_date)) as StartDate, max(coalesce(closed_date, status_date)) as EndDate
from Table1
group by ID_Number, Status
) a
The tricky bit is determining the previous status and putting it on the same row as the current status. It would be simplified a little if there were a correlation between Key and StatusDate (i.e. that Key(x) > Key(y) always implies StatusDate(x) >= StatusDate(y)). Unfortunately, that doesn't seem to be the case.
PS: I am assuming Key is a unique identifier on your table; you haven't said anything to indicate otherwise.
SELECT Key,
ID_Number,
(
SELECT TOP 1 Key
FROM StatusUpdates prev
WHERE (prev.ID_Number = cur.ID_Number)
AND ( (prev.StatusDate < cur.StatusDate)
OR ( prev.StatusDate = cur.StatusDate
AND prev.Key < cur.Key
)
)
ORDER BY StatusDate, Key /*Consider index on (ID_Number, StatusDate, Key)*/
) PrevKey
FROM StatusUpdates cur
Once you have this as a basis, it's easy to extrapolate to any other info you need from the current or previous StatusUpdate. E.g.
SELECT c.*,
p.Status AS PrevStatus,
p.StatusDate AS PrevStatusDate,
DATEDIFF(d, c.StatusDate, p.StatusDate) AS DaysElapsed
FROM (
SELECT Key,
ID_Number,
Status,
SattusDate,
(
SELECT TOP 1 Key
FROM StatusUpdates prev
WHERE (prev.ID_Number = cur.ID_Number)
AND ( (prev.StatusDate < cur.StatusDate)
OR ( prev.StatusDate = cur.StatusDate
AND prev.Key < cur.Key
)
)
ORDER BY StatusDate, Key
) PrevKey
FROM StatusUpdates cur
) c
JOIN StatusUpdates p ON
p.Key = c.PrevKey