Get rules Entry time and exit time in a row - sql

I am trying to identify the entry and exit time of each rule in the data. The exit time is determined as follows: if a rule does not appear in the next activity, then the time of that next activity is the exit time for that rule.
Here is what my data looks like.
-- Sample activity log: one row per rule that is present on a given line of a visit.
DECLARE #T AS TABLE
(
    Visit_ID      INT,
    Line          INT,
    Rule_ID       INT,
    Activity_DTTM DATETIME
);

INSERT INTO #T (Visit_ID, Line, Rule_ID, Activity_DTTM)
VALUES
    -- line 1: three rules present
    (123, 1, 100072, '2022-07-01 03:05:00.000'),
    (123, 1, 100173, '2022-07-01 03:05:00.000'),
    (123, 1, 718719, '2022-07-01 03:05:00.000'),
    -- line 2: rule 100173 is no longer present
    (123, 2, 100072, '2022-07-01 04:05:00.000'),
    (123, 2, 718719, '2022-07-01 04:05:00.000'),
    -- line 3: rule 718719 is no longer present
    (123, 3, 100072, '2022-07-01 06:00:00.000'),
    -- line 4: rule 100173 appears again
    (123, 4, 100072, '2022-07-02 02:10:00.000'),
    (123, 4, 100173, '2022-07-02 02:10:00.000')
-- Expected output: one row per continuous stay of a rule within a visit.
-- Exit_Time stays NULL while the rule is still present on the last line.
DECLARE #Desiredresult AS TABLE
(
    Visit      INT,
    Rule_ID    INT,
    Entry_Time DATETIME,
    Exit_Time  DATETIME
);

INSERT INTO #Desiredresult (Visit, Rule_ID, Entry_Time, Exit_Time)
VALUES
    (123, 100072, '2022-07-01 03:05:00.000', NULL),
    (123, 100173, '2022-07-01 03:05:00.000', '2022-07-01 04:05:00.000'),
    (123, 718719, '2022-07-01 03:05:00.000', '2022-07-01 06:00:00.000'),
    (123, 100173, '2022-07-02 02:10:00.000', NULL);

SELECT
    Visit,
    Rule_ID,
    Entry_Time,
    Exit_Time
FROM #Desiredresult
Is this a version of the Gaps and Islands problem? I have written a SQL query, but it is very inefficient: it takes more than 2 hours to process 10 million rows.
This is the query i tried.
-- Entry and exit time per visit and rule, found by comparing every row with the
-- neighbouring lines of the same visit.
--   ENTRY_DTTM: earliest activity time at which the rule has no row on the previous line.
--   EXIT_DTTM : latest activity time of a following line on which the rule has no row.
-- Fix: next_flat was joined on prev_flat.line, so the "next line" lookup was driven by
-- the wrong alias. The prev_flat join itself was never read by the select list and only
-- multiplied rows (MIN/MAX are unaffected by duplicates), so it has been removed.
SELECT
    me.Visit_ID,
    me.Rule_ID,
    MIN(CASE WHEN prev_split.Line IS NULL THEN me.Activity_DTTM ELSE NULL END) AS ENTRY_DTTM,
    MAX(CASE WHEN next_split.Line IS NULL THEN next_flat.Activity_DTTM ELSE NULL END) AS EXIT_DTTM
FROM #T AS me
-- any row on the following line of the visit (supplies the exit timestamp)
LEFT JOIN #T AS next_flat
    ON  me.Visit_ID = next_flat.Visit_ID
    AND me.Line + 1 = next_flat.Line
-- the same rule on the previous line (no match => the rule starts here)
LEFT JOIN #T AS prev_split
    ON  me.Visit_ID = prev_split.Visit_ID
    AND me.Rule_ID  = prev_split.Rule_ID
    AND me.Line - 1 = prev_split.Line
-- the same rule on the following line (no match => the rule ends here)
LEFT JOIN #T AS next_split
    ON  me.Visit_ID = next_split.Visit_ID
    AND me.Rule_ID  = next_split.Rule_ID
    AND me.Line + 1 = next_split.Line
GROUP BY
    me.Visit_ID,
    me.Rule_ID

Related

Having Trouble Summing distinct values

I have a fairly poorly designed DB that I'm trying to pull reports from. I'm attempting to sum the value on the column GuestCount, however with the structure of the joins, i'm getting a cartesian situation that's making the sum inaccurate. I can't use Sum(Distinct) because I'm not trying to sum the distinct values in GuestCount, but rather the sum of distinct rows.
Here's the SQL to set up the Tables:
-- Transaction lines: one row per item rung up on a check at a table.
CREATE TABLE TesttblTransactions
(
    ID        INT,
    [sysdate] DATE,
    TxnHour   TINYINT,
    Facility  NVARCHAR(50),
    TableID   INT,
    [Check]   INT,
    Item      INT,
    Parent    INT
);

-- Guest count recorded per facility, table and date.
CREATE TABLE TesttblTablesGuests
(
    ID         INT,
    Facility   NVARCHAR(50),
    TableID    INT,
    GuestCount TINYINT,
    TableDate  DATE
);

-- Facility master data; ClientKey is what the transaction rows join on.
CREATE TABLE TesttblFacilities
(
    ID            INT,
    ClientKey     NVARCHAR(50),
    Brand         NVARCHAR(50),
    OrgFacilityID NVARCHAR(50),
    UnitID        SMALLINT
);
-- Seed data for the three test tables, one tuple per line.
INSERT INTO testtbltransactions
    (ID, [Sysdate], TxnHour, Facility, TableID, [Check], Item, Parent)
VALUES
    (1, '20221201', 7, 'JOES',     1001, 12345, 8898989, 0),
    (2, '20221201', 7, 'JOES',     1001, 12345, 8776767, 1),
    (3, '20221201', 7, 'JOES',     1001, 12345, 856643,  0),
    (4, '20221201', 7, 'THE DIVE', 1001, 67890, 662342,  0),
    (5, '20221201', 7, 'THE DIVE', 1001, 67890, 244234,  0),
    (6, '20221201', 7, 'JOES',     1002, 12344, 873323,  0);

INSERT INTO testtblTablesGuests
    (ID, Facility, TableID, GuestCount, TableDate)
VALUES
    (1, 'JOES',     1001, 4, '20221201'),
    (2, 'THE DIVE', 1001, 1, '20221201'),
    (3, 'JOES',     1002, 1, '20221201');

INSERT INTO testtblFacilities
    (ID, ClientKey, Brand, OrgFacilityID, UnitID)
VALUES
    (1, 'JOES',     'Joes Hospitality Group LLC', 'Joes Bar', 987),
    (2, 'THE DIVE', 'The Dive Restaurant Group',  'The Dive', 565);
--Here's the SQL that I need for reporting but can't seem to get working:
Declare #StartDate as Date = '12-1-2022'
Declare #EndDate as Date = '12-1-2022'
--The query we want to work
-- Hourly 'Dine In Guest Count' per facility for the requested date range.
-- NOTE: GuestCount is stored once per table in TesttblTablesGuests, but the join below
-- yields one row per matching transaction line, so SUM() adds a table's GuestCount once
-- for every Parent = 0 line of that table instead of once per table.
-- Fix: removed the stray trailing backtick after the GROUP BY list (syntax error).
SELECT
    TesttblFacilities.ClientKey,
    TesttblFacilities.Brand,
    format(testtbltransactions.sysdate,'yyyy-MM-dd') AS [Date],
    'H' AS Freqency,
    Testtbltransactions.[TxnHour] AS [Hour],
    TesttblFacilities.UnitID AS [UnitID],
    'Dine In Guest Count' as Metric,
    Sum(TesttblTablesGuests.GuestCount) AS [Value]
FROM ((Testtbltransactions
    JOIN Testtbltablesguests
        ON (Testtbltablesguests.TableDate = Testtbltransactions.sysdate)
        AND (Testtbltransactions.FACILITY = Testtbltablesguests.facility)
        AND (Testtbltransactions.tableid = Testtbltablesguests.tableid))
    JOIN TesttblFacilities
        ON Testtbltransactions.FACILITY = TesttblFacilities.ClientKey)
Where (((Testtbltransactions.parent)=0))
    and Testtbltransactions.sysdate >= #StartDate
    and Testtbltransactions.sysdate <= #EndDate
GROUP BY
    TesttblFacilities.ClientKey,
    Testtblfacilities.UnitID,
    TesttblFacilities.Brand,
    Testtbltransactions.facility,
    Testtbltransactions.sysdate,
    Testtbltransactions.TxnHour
I'm getting 9 and 2, instead of 5 and 1.
In the comments NBK suggested doing a subquery - and it took me a while but I think i found something that works.. . .
Declare #StartDate as Date = '12-1-2022'
Declare #EndDate as Date = '12-1-2022'

-- Guest count per hour and facility, counting each table once per day and hour.
-- Changes against the first draft:
--   * the join also matches TableDate to the transaction date (as the report query
--     does), so a table's guest row from another day can no longer be picked up;
--   * the transaction date is part of the DISTINCT key, so identical tables on
--     different days inside the range are not merged into one row;
--   * the duplicated t1.Facility in GROUP BY is removed and the SUM column is named.
Select
    t1.txnhour,
    t1.facility,
    SUM(t1.guestcount) as guestcount
from
(
    -- one row per (date, hour, facility, table) that has at least one Parent = 0 line
    Select Distinct
        TesttblTransactions.[sysdate] as txndate,
        TesttblTransactions.TableID as TableID,
        TesttblTransactions.[txnHour] as txnhour,
        TesttblTransactions.Facility as Facility,
        TesttblTablesGuests.GuestCount as guestcount,
        TesttblTransactions.Parent as parent
    From TesttblTransactions
    Join TesttblTablesGuests
        on TesttblTablesGuests.TableID = TesttblTransactions.TableID
        and TesttblTablesGuests.Facility = TesttblTransactions.Facility
        and TesttblTablesGuests.TableDate = TesttblTransactions.[sysdate]
    Where TesttblTransactions.Parent = 0
        and TesttblTransactions.[sysdate] >= #StartDate
        and TesttblTransactions.[sysdate] <= #EndDate
) T1
Group by t1.facility, t1.txnhour
I'm going to continue to refine this, but I think I should be able to move forward with this.

SQL Group By specific column with nullable

Let's say I have this data in my table A:
group_id
type
active
1
A
true
1
B
false
1
C
true
2
null
false
3
B
true
3
C
false
I want to create a query which returns the A row if it exists (without the type column), and otherwise returns a row with active false.
For this specific table the result will be:
group_id
active
1
true
2
false
3
false
How can I do this ?
I'm assuming I have to use a GROUP BY but I can't find a way to do it.
Thank you
This is a classic row_number problem, generate a row number based on your ordering criteria, then select just the first row in each grouping.
-- Sample data matching the question.
declare #MyTable table (group_id int, [type] char(1), active bit);
insert into #MyTable (group_id, [type], active)
values
(1, 'A', 1),
(1, 'B', 0),
(1, 'C', 1),
(2, null, 0),
(3, 'B', 1),
(3, 'C', 0);
-- Rank the rows of each group so that a type 'A' row (if any) comes first,
-- then keep only the first row per group.
with cte as (
select group_id, [type], active
, row_number() over (
partition by group_id
order by case when [type] = 'A' then 1 else 0 end desc, active asc
) rn
from #MyTable
)
select group_id
-- Only an 'A' row contributes its own flag. When the group has no 'A' row the result
-- must be false, even if every remaining row of the group happens to be active.
, case when [type] = 'A' then active else cast(0 as bit) end as active
from cte
where rn = 1
order by group_id;
Returns:
group_id
active
1
1
2
0
3
0
Note: Providing the DDL+DML as I have shown makes it much easier for people to assist.
This should do it. We select all the distinct group_ids and then join our table back to that. There is an ISNULL function that will insert the 'false' when 'A' type records are not found.
-- Sample data; active is kept as text ('true' / 'false') in this variant.
DECLARE #tableA TABLE
(
    group_id INT,
    [type]   NCHAR(1),
    active   NVARCHAR(10)
);

INSERT INTO #tableA (group_id, [type], active)
VALUES
    (1, 'A', 'true'),
    (1, 'B', 'false'),
    (1, 'C', 'false'),
    (2, NULL, 'false'),
    (3, 'B', 'true'),
    (3, 'C', 'false');

-- Every distinct group, left-joined to its type 'A' row.
-- Groups without such a row fall back to 'false'.
SELECT
    grp.group_id,
    ISNULL(a_row.active, 'false') AS active
FROM
    (
        SELECT DISTINCT group_id
        FROM #tableA
    ) AS grp
LEFT OUTER JOIN #tableA AS a_row
    ON  a_row.group_id = grp.group_id
    AND a_row.type = 'A'

How to get the ID from the table which is not mention EndDate

How to get the name from the table which is not having EndDate
In the picture above, I need to get the details for D and G from the table.
(To explain further:
A, C, D and G have an end date; A and C have since started again, but D and G have not, so the query needs to return the names D and G.)
The code I used does not work for this:
-- Sample teller assignments; EndDate is NULL for a period that is still open.
DECLARE #T AS TABLE
(
    SubInventoryID   INT,
    SubInventoryName VARCHAR(20),
    RolesName        VARCHAR(20),
    StartDate        DATE,
    EndDate          DATE
);

INSERT INTO #T (SubInventoryID, SubInventoryName, RolesName, StartDate, EndDate)
VALUES
    (30, 'RIF-Teller', 'Teller', '2016-12-27', '2017-01-23'),
    (30, 'RIF-Teller', 'Teller', '2016-12-08', NULL),
    (30, 'RIF-Teller', 'Teller', '2017-01-02', '2017-01-05'),
    (31, 'RIF-Teller', 'Teller', '2017-01-05', NULL),
    (24, 'MHQ-Teller', 'Teller', '2016-09-20', '2017-01-23'),
    (24, 'MHQ-Teller', 'Teller', '2016-08-01', '2017-01-05'),
    (24, 'MHQ-Teller', 'Teller', '2017-01-05', NULL)
Query
-- Teller positions with a vacancy label.
-- Branch 1: every teller assignment, 'Occupied' while EndDate is NULL, else 'Closed'.
SELECT
    UP.SubInventoryID,
    S.SubInventoryName AS SubInventoryName,
    RolesName,
    UP.StartDate AS StartDate,
    UP.EndDate AS EndDate,
    CASE WHEN UP.EndDate IS NULL THEN 'Occupied' ELSE 'Closed' END AS Vacancy
FROM [View_Alx_UserPosition] UP
INNER JOIN ALX_Branches B ON B.BranchID = UP.BranchID
INNER JOIN ALX_SubInventories S ON S.SubInventoryID = UP.SubInventoryID
WHERE UP.RolesName LIKE '%Teller%'
UNION
-- Branch 2: positions whose closed assignment was never followed by a later start
-- are reported once as 'Free'.
-- Fix: this branch carried an extra '' FullName expression (7 columns against 6 in
-- branch 1), which UNION rejects; both branches now expose the same six columns.
SELECT DISTINCT
    UP.SubInventoryID,
    S.SubInventoryName AS SubInventoryName,
    '' AS RolesName,
    NULL AS StartDate,
    NULL AS EndDate,
    'Free' AS Vacancy
FROM [View_Alx_UserPosition] UP
INNER JOIN ALX_Branches B ON B.BranchID = UP.BranchID
INNER JOIN ALX_SubInventories S ON S.SubInventoryID = UP.SubInventoryID
WHERE UP.EndDate IS NOT NULL
    AND UP.RolesName LIKE '%Teller%'
    AND NOT EXISTS
    (
        SELECT 1
        FROM [View_Alx_UserPosition] UP1
        WHERE UP1.SubInventoryID = UP.SubInventoryID
            AND UP1.StartDate >= UP.EndDate
        -- AND UP1.EndDate IS NOT NULL
    )
Update
Create and populate sample table (Please save us this step in your future questions)
-- Sample periods per name; a NULL EndDate marks a period that is still running.
DECLARE #T AS TABLE
(
    ID        INT IDENTITY(1,1),
    Name      CHAR(1),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #T (Name, StartDate, EndDate)
VALUES
    ('A', '2016-04-04', '2017-04-03'),
    ('B', '2016-04-04', NULL),
    ('C', '2016-04-04', '2017-04-03'),
    ('D', '2016-04-04', '2017-04-03'),
    ('E', '2016-04-04', NULL),
    ('F', '2016-04-04', NULL),
    ('G', '2016-04-04', '2017-04-03'),
    -- C and A start again on the day their first period ended
    ('C', '2017-04-03', NULL),
    ('A', '2017-04-03', NULL)
The query:
-- Names whose period has ended and for which no period starts on or after that end date.
SELECT ended.Name
FROM #T AS ended
WHERE ended.EndDate IS NOT NULL
    AND NOT EXISTS
    (
        SELECT 1
        FROM #T AS restarted
        WHERE restarted.Name = ended.Name
            AND restarted.StartDate >= ended.EndDate
    )
Results:
Name
D
G
First version
Assuming I understand the question, this should do the trick:
-- First version: names whose period has ended and that have no later period which is
-- itself already closed.
-- NOTE(review): because of the EndDate IS NOT NULL filter, a restart that is still open
-- (NULL EndDate) does not count as a later period here; the updated query drops it.
SELECT ended.Name
FROM View_User AS ended
WHERE ended.EndDate IS NOT NULL
    AND NOT EXISTS
    (
        SELECT 1
        FROM View_User AS later
        WHERE later.Name = ended.Name
            AND later.StartDate >= ended.EndDate
            AND later.EndDate IS NOT NULL
    )

How to match groups of rows taking order into account in TSQL?

I have a table that is storing groups of related rows, the different rows are related via a groupIdentifier column. Groups can be any number of rows in size.
I need to be able to pass in a new set of groups of rows and then find a mapping of new to existing matching groups. The complication is that the order of each row within the group is defined by a rowOrdinal value and must be taken into account. That rowOrdinal value is not always 0 based but the rows within a group are sorted by that value. Also #existingData contains 100s of thousands of potential groups, so the query needs to be performant
Here is an example input dataset:
-- Groups already stored; rows of one group share a groupIdentifier and are ordered by rowOrdinal.
DECLARE #existingData TABLE
(
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
)

INSERT INTO #existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40, 0, 'X'),
    (41, 0, 'Y')

-- Incoming groups to be matched; their rowOrdinal values need not start at 0.
DECLARE #newData TABLE
(
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
)

INSERT INTO #newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),
    (2, 0, 'Y'),
    (2, 1, 'X')
-- Expected: #newData group 1 matches #existingData group 100; #newData group 2 has no match in #existingData
The desired result is a result set with two columns, existingGroupIdentifier and newGroupIdentifier. In this case the only result row would be 100, 1. The 100 being the #existingData groupIdentifier and the 1 being the #newData groupIdentifier
Edit
The following is what I have come up with so far, by assuming I will ever have a max group size of N, I can manually copy paste tsql code that uses pivot and temp tables to do the comparison for each group size. BUT, this limits the system to N, seems ugly, and I would prefer a way to do it in a single query if possible
-- Stored groups (same content as in the question).
DECLARE #existingData TABLE
(
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
)

INSERT INTO #existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40, 0, 'X'),
    (41, 0, 'Y')

-- Incoming groups, extended with a one-row group (3) and a three-row group (5).
DECLARE #newData TABLE
(
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
)

INSERT INTO #newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),
    (2, 0, 'Y'),
    (2, 1, 'X'),
    (3, 99, 'Y'),
    (5, 4, 'A'),
    (5, 10, 'B'),
    (5, 200, 'C')
-- Step 1: record the size of every group.
-- For #existingData, only rows whose value also occurs somewhere in #newData are counted,
-- so groups that share no value with the new data are left out altogether.
DECLARE #potentialGroupsInExistingData TABLE (groupIdentifier INT, groupSize INT)

INSERT INTO #potentialGroupsInExistingData
SELECT
    ExistingData.groupIdentifier,
    COUNT(ExistingData.groupIdentifier)
FROM #existingData ExistingData
WHERE EXISTS
    (
        SELECT TOP 1 *
        FROM #newData
        WHERE value = ExistingData.value
    )
GROUP BY ExistingData.groupIdentifier

-- Row count of each incoming group.
DECLARE #groupsInNewData TABLE (groupIdentifier INT, groupSize INT)

INSERT INTO #groupsInNewData
SELECT
    NewData.groupIdentifier,
    COUNT(NewData.groupIdentifier)
FROM #newData NewData
GROUP BY NewData.groupIdentifier
-- Groups of size one: no pivot needed, the single values are compared directly.
-----------------------------------
SELECT
    ExistingData.groupIdentifier AS ExistingGroupIdentifier,
    NewData.groupIdentifier      AS NewGroupIdentifier
FROM #potentialGroupsInExistingData PotentialExistingGroup
-- pair every candidate existing group with every new group ...
CROSS JOIN #groupsInNewData GroupsInNewData
INNER JOIN #existingData ExistingData
    ON ExistingData.groupIdentifier = PotentialExistingGroup.groupIdentifier
-- ... and keep only the pairs whose values are equal
INNER JOIN #newData NewData
    ON  NewData.groupIdentifier = GroupsInNewData.groupIdentifier
    AND NewData.value = ExistingData.value
WHERE GroupsInNewData.groupSize = 1
    AND PotentialExistingGroup.groupSize = 1
-- Groups of size two: pivot each group into one row (valueOne, valueTwo), then join the rows.
-----------------------------------
DECLARE #existingGroupsOfSizeTwo TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(2))

INSERT INTO #existingGroupsOfSizeTwo
SELECT
    *
FROM
    (
        -- renumber the rows of every two-row group as 1..2, highest rowOrdinal first
        SELECT
            ExistingData.groupIdentifier,
            ExistingData.value,
            ROW_NUMBER() OVER (
                PARTITION BY ExistingData.groupIdentifier
                ORDER BY ExistingData.rowOrdinal DESC
            ) AS ActualOrdinal
        FROM #potentialGroupsInExistingData PotentialGroup
        INNER JOIN #existingData ExistingData
            ON ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
        WHERE PotentialGroup.groupSize = 2
    ) AS T
PIVOT ( MIN(value) FOR T.ActualOrdinal IN ([1], [2]) ) AS p

DECLARE #newGroupsOfSizeTwo TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(2))

INSERT INTO #newGroupsOfSizeTwo
SELECT
    *
FROM
    (
        -- same renumbering for the incoming groups
        SELECT
            NewData.groupIdentifier,
            NewData.value,
            ROW_NUMBER() OVER (
                PARTITION BY NewData.groupIdentifier
                ORDER BY NewData.rowOrdinal DESC
            ) AS ActualOrdinal
        FROM #groupsInNewData NewDataGroup
        INNER JOIN #newData NewData
            ON NewData.groupIdentifier = NewDataGroup.groupIdentifier
        WHERE NewDataGroup.groupSize = 2
    ) AS T
PIVOT ( MIN(value) FOR T.ActualOrdinal IN ([1], [2]) ) AS p

-- Two groups match when both pivoted positions hold the same value.
SELECT
    ExistingData.groupIdentifier AS ExistingGroupIdentifier,
    NewData.groupIdentifier      AS NewGroupIdentifier
FROM #newGroupsOfSizeTwo NewData
INNER JOIN #existingGroupsOfSizeTwo ExistingData
    ON  ExistingData.valueOne = NewData.valueOne
    AND ExistingData.valueTwo = NewData.valueTwo
-- Groups of size three: same approach as for size two, with a third pivoted position.
-----------------------------------
DECLARE #existingGroupsOfSizeThree TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(1), valueThree VARCHAR(1))

INSERT INTO #existingGroupsOfSizeThree
SELECT
    *
FROM
    (
        -- renumber the rows of every three-row group as 1..3, highest rowOrdinal first
        SELECT
            ExistingData.groupIdentifier,
            ExistingData.value,
            ROW_NUMBER() OVER (
                PARTITION BY ExistingData.groupIdentifier
                ORDER BY ExistingData.rowOrdinal DESC
            ) AS ActualOrdinal
        FROM #potentialGroupsInExistingData PotentialGroup
        INNER JOIN #existingData ExistingData
            ON ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
        WHERE PotentialGroup.groupSize = 3
    ) AS T
PIVOT ( MIN(value) FOR T.ActualOrdinal IN ([1], [2], [3]) ) AS p

DECLARE #newGroupsOfSizeThree TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(1), valueThree VARCHAR(1))

INSERT INTO #newGroupsOfSizeThree
SELECT
    *
FROM
    (
        -- same renumbering for the incoming groups
        SELECT
            NewData.groupIdentifier,
            NewData.value,
            ROW_NUMBER() OVER (
                PARTITION BY NewData.groupIdentifier
                ORDER BY NewData.rowOrdinal DESC
            ) AS ActualOrdinal
        FROM #groupsInNewData NewDataGroup
        INNER JOIN #newData NewData
            ON NewData.groupIdentifier = NewDataGroup.groupIdentifier
        WHERE NewDataGroup.groupSize = 3
    ) AS T
PIVOT ( MIN(value) FOR T.ActualOrdinal IN ([1], [2], [3]) ) AS p

-- Two groups match when all three pivoted positions hold the same value.
SELECT
    ExistingData.groupIdentifier AS ExistingGroupIdentifier,
    NewData.groupIdentifier      AS NewGroupIdentifier
FROM #newGroupsOfSizeThree NewData
INNER JOIN #existingGroupsOfSizeThree ExistingData
    ON  ExistingData.valueOne = NewData.valueOne
    AND ExistingData.valueTwo = NewData.valueTwo
    AND ExistingData.valueThree = NewData.valueThree
General idea
The given tables can have several rows for the same group ID.
If we had a method to converge the given tables in such a way that they had one row for each group ID plus all the values of the group in one column, then it would become trivial to find all matching groups.
If we did this transformation
#existingData -> #ExistingDataGrouped (ID, DataValues)
#newData -> #NewDataGrouped (ID, DataValues)
then the final query would look like this (note that we are joining on DataValues, not ID):
-- Pair every existing group with every new group that has the same DataValues.
SELECT
    existing_grp.ID,
    new_grp.ID
FROM #ExistingDataGrouped AS existing_grp
INNER JOIN #NewDataGrouped AS new_grp
    ON new_grp.DataValues = existing_grp.DataValues
How to make the grouped tables
convert values into XML (search for "group_concat" for SQL Server, e.g. How to make a query with group_concat in sql server)
use CLR implementation of GroupConcat function with extra parameter for specifying the order. I personally used http://groupconcat.codeplex.com/ and it could be a good start.
Some optimization
If the number of source rows is significant, it is possible to do some preliminary filtering by using CHECKSUM_AGG.
-- Preliminary filter: pair groups whose aggregated checksums are equal.
-- Equal checksums do not prove equal content, so the pairs are only candidates.
WITH
-- rows of each existing group renumbered 1..n in RowOrdinal order
existing_numbered AS
(
    SELECT
        GroupIdentifier,
        ROW_NUMBER() OVER (PARTITION BY GroupIdentifier ORDER BY RowOrdinal) AS rn,
        Value
    FROM #ExistingData
),
-- rows of each new group renumbered the same way
new_numbered AS
(
    SELECT
        GroupIdentifier,
        ROW_NUMBER() OVER (PARTITION BY GroupIdentifier ORDER BY RowOrdinal) AS rn,
        Value
    FROM #NewData
),
-- one checksum per existing group, built from (position, value) of every row
existing_sum AS
(
    SELECT
        GroupIdentifier,
        CHECKSUM_AGG(CHECKSUM(rn, Value)) AS DataValues
    FROM existing_numbered
    GROUP BY GroupIdentifier
),
-- one checksum per new group
new_sum AS
(
    SELECT
        GroupIdentifier,
        CHECKSUM_AGG(CHECKSUM(rn, Value)) AS DataValues
    FROM new_numbered
    GROUP BY GroupIdentifier
)
SELECT
    existing_sum.GroupIdentifier AS ExistingGroupIdentifier,
    new_sum.GroupIdentifier      AS NewGroupIdentifier
FROM existing_sum
INNER JOIN new_sum
    ON new_sum.DataValues = existing_sum.DataValues
;
At first we re-number all rows so that each group starts from 1 (CTE_ExistingRN and CTE_NewRN).
CHECKSUM(rn, Value) returns some integer for each source row taking into account the row number and its value. Different values would usually produce different checksums.
CHECKSUM_AGG groups all checksums together.
Result set:
ExistingGroupIdentifier NewGroupIdentifier
100 1
100 2
This result would contain all groups that match exactly (100, 1), and it also can contain some groups that do not match, but by chance their checksums happened to be the same (100, 2). That's why this step is preliminary. To get accurate results you should compare actual values, not their checksums. But this step may filter out a significant number of groups that definitely don't match.
Solution using XML
This solution converts values of each group into XML and would provide accurate results. I personally never used FOR XML before and was curious to see how it works.
-- Exact match: concatenate the values of every group in RowOrdinal order into one
-- string ('X,Y,') and join existing and new groups on that string.
WITH
-- distinct group ids on each side
existing_ids AS
(
    SELECT DISTINCT GroupIdentifier
    FROM #ExistingData
),
new_ids AS
(
    SELECT DISTINCT GroupIdentifier
    FROM #NewData
),
-- ordered value list per existing group
existing_lists AS
(
    SELECT
        GroupIdentifier,
        flat.ValueList AS DataValues
    FROM existing_ids
    CROSS APPLY
    (
        -- FOR XML PATH('') glues the unnamed 'value,' fragments together
        SELECT Value + ','
        FROM #ExistingData
        WHERE GroupIdentifier = existing_ids.GroupIdentifier
        ORDER BY RowOrdinal
        FOR XML PATH(''), TYPE
    ) AS as_xml (ValueXml)
    CROSS APPLY
    (
        -- read the XML back as plain text
        SELECT as_xml.ValueXml.value('.', 'NVARCHAR(MAX)')
    ) AS flat (ValueList)
),
-- ordered value list per new group
new_lists AS
(
    SELECT
        GroupIdentifier,
        flat.ValueList AS DataValues
    FROM new_ids
    CROSS APPLY
    (
        SELECT Value + ','
        FROM #NewData
        WHERE GroupIdentifier = new_ids.GroupIdentifier
        ORDER BY RowOrdinal
        FOR XML PATH(''), TYPE
    ) AS as_xml (ValueXml)
    CROSS APPLY
    (
        SELECT as_xml.ValueXml.value('.', 'NVARCHAR(MAX)')
    ) AS flat (ValueList)
)
SELECT
    existing_lists.GroupIdentifier AS ExistingGroupIdentifier,
    new_lists.GroupIdentifier      AS NewGroupIdentifier
FROM existing_lists
INNER JOIN new_lists
    ON new_lists.DataValues = existing_lists.DataValues
;
Result set:
ExistingGroupIdentifier NewGroupIdentifier
100 1
Try this:
-- Cursor-based matcher: for every (existing group, new group) pair of equal size, walk both
-- groups in rowOrdinal order and record the pair in #results when every value matches.
declare #existingData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into #existingData values
(100, 0, 'X'),
(100, 1, 'Y'),
(200, 0, 'A'),
(200, 1, 'B'),
(200, 2, 'C'),
(40, 0, 'X'),
(41, 0, 'Y')
declare #newData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into #newData values
(1, 55, 'X'),
(1, 59, 'Y'),
(2, 0, 'Y'),
(2, 1, 'X')
-- collects the matching (existing group, new group) pairs
declare #results table (
existingGID int,
newGID int)
DECLARE #existingGroupID int
-- outer loop: one iteration per existing group
DECLARE outer_cursor CURSOR FOR
SELECT DISTINCT groupIdentifier FROM #existingData
OPEN outer_cursor
FETCH NEXT FROM outer_cursor INTO #existingGroupID
WHILE ##FETCH_STATUS = 0
BEGIN
DECLARE #existingGroupCount int
SELECT #existingGroupCount = COUNT(value) FROM #existingData WHERE groupIdentifier = #existingGroupID
DECLARE #newGroupID int
-- inner loop: one iteration per new group, re-opened for every existing group
DECLARE inner_cursor CURSOR FOR
SELECT DISTINCT groupIdentifier from #newData
OPEN inner_cursor
FETCH NEXT FROM inner_cursor INTO #newGroupID
WHILE ##FETCH_STATUS = 0
BEGIN
DECLARE #newGroupCount int
SELECT #newGroupCount = COUNT(value) FROM #newData WHERE groupIdentifier = #newGroupID
-- if groups are different sizes, skip
IF #newGroupCount = #existingGroupCount
BEGIN
-- #newStart holds the rowOrdinal of the last new-group row consumed (-1 = none yet)
DECLARE #newStart int = -1
DECLARE #currentValue varchar(1)
DECLARE #validGroup bit = 1
-- walks the existing group's values in rowOrdinal order
DECLARE equality_cursor CURSOR FOR
SELECT value FROM #existingData WHERE groupIdentifier = #existingGroupID ORDER BY rowOrdinal
OPEN equality_cursor
FETCH NEXT FROM equality_cursor INTO #currentValue
WHILE ##FETCH_STATUS = 0
BEGIN
DECLARE #newValue varchar(1)
-- take the next new-group row after #newStart and advance #newStart to it
SELECT TOP 1 #newValue = value, #newStart = rowOrdinal FROM #newData WHERE groupIdentifier = #newGroupID AND #newStart < rowOrdinal ORDER BY rowOrdinal
-- first mismatch disqualifies the pair; stop comparing
IF(#newValue <> #currentValue)
BEGIN
SET #validGroup = 0
BREAK
END
FETCH NEXT FROM equality_cursor INTO #currentValue
END
CLOSE equality_cursor
DEALLOCATE equality_cursor
IF #validGroup = 1
BEGIN
INSERT INTO #results (existingGID, newGID) VALUES (#existingGroupID, #newGroupID)
END
END
-- this fetch also refreshes ##FETCH_STATUS for the inner WHILE after the equality loop
FETCH NEXT FROM inner_cursor INTO #newGroupID
END
CLOSE inner_cursor
DEALLOCATE inner_cursor
FETCH NEXT FROM outer_cursor INTO #existingGroupID
END
CLOSE outer_cursor
DEALLOCATE outer_cursor
SELECT * FROM #results
I need to get going, but I'll edit this later with better comments to explain what the code does.

Need approach for working with small subsets of a large dataset

I am facing a conceptual problem that I am having a hard time overcoming. I am hoping the SO folks can help me overcome it with a nudge in the right direction.
I am in the process of doing some ETL work with the source data being very similar and very large. I am loading it into a table that is intended for replication and I only want the most basic of information in this target table.
My source table looks something like this:
I need my target table to reflect it as such:
As you can see I didn't duplicate the InTransit status where it was duplicated in the source table. The steps I am trying to figure out how to achieve are
Get any new distinct rows entered since the last time the query ran. (Easy)
For each TrackingId I need to check if each new status is already the most recent status in the target and if so disregard otherwise go ahead and insert it. Which this means I have to also start at the earliest of the new statuses and go from there. (I have no *(!#in clue how I'll do this)
Do this every 15 minutes so that statuses are kept very recent so step #2 must be performant.
My source table could easily consist of 100k+ rows but having the need to run this every 15 minutes requires me to make sure this is very performant thus why I am really trying to avoid cursors.
Right now the only way I can see to do this is using a CLR sproc but I think there may be better ways thus I am hoping you guys can nudge me in the right direction.
I am sure I am probably leaving something out that you may need so please let me know what info you may need and I'll happily provide.
Thank you in advance!
EDIT:
OK, I wasn't explicit enough in my question. My source table is going to contain multiple tracking IDs: it may hold 100k+ rows covering multiple TrackingIds, with multiple statuses for each TrackingId. I have to update the target table as shown above for each individual TrackingId, but my source will be an amalgam of TrackingIds.
Here's a solution without self-joins:
-- First row of every run of consecutive identical statuses for one tracking id.
-- Fix: the filter column was misspelled as "tackingId"; the column is trackingId.
WITH q AS
(
    SELECT *,
        -- position of the row in the overall timeline
        ROW_NUMBER() OVER (ORDER BY statusDate) AS rn,
        -- position of the row among the rows with the same status
        ROW_NUMBER() OVER (PARTITION BY status ORDER BY statusDate) AS rns
    FROM tracking
    WHERE trackingId = #id
),
qs AS
(
    SELECT *,
        -- rn - rns is used as the run key; rnn numbers the rows within each key
        ROW_NUMBER() OVER (PARTITION BY rn - rns ORDER BY statusDate) AS rnn
    FROM q
)
SELECT *
FROM qs
WHERE rnn = 1
ORDER BY
    statusDate
Here's a script to check:
-- Check script: ten status rows for tracking id 1, one per day, with repeated statuses.
DECLARE #tracking TABLE
(
    id         INT NOT NULL PRIMARY KEY,
    trackingId INT NOT NULL,
    status     INT,
    statusDate DATETIME
)

INSERT INTO #tracking (id, trackingId, status, statusDate)
VALUES
    (1,  1, 1, DATEADD(d, 1,  '2010-01-01')),
    (2,  1, 2, DATEADD(d, 2,  '2010-01-01')),
    (3,  1, 2, DATEADD(d, 3,  '2010-01-01')),
    (4,  1, 2, DATEADD(d, 4,  '2010-01-01')),
    (5,  1, 3, DATEADD(d, 5,  '2010-01-01')),
    (6,  1, 3, DATEADD(d, 6,  '2010-01-01')),
    (7,  1, 4, DATEADD(d, 7,  '2010-01-01')),
    (8,  1, 2, DATEADD(d, 8,  '2010-01-01')),
    (9,  1, 2, DATEADD(d, 9,  '2010-01-01')),
    (10, 1, 1, DATEADD(d, 10, '2010-01-01'))
;

-- Same query as above, run against the sample table without a tracking id filter.
WITH q AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (ORDER BY statusDate) AS rn,
        ROW_NUMBER() OVER (PARTITION BY status ORDER BY statusDate) AS rns
    FROM #tracking
),
qs AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY rn - rns ORDER BY statusDate) AS rnn
    FROM q
)
SELECT *
FROM qs
WHERE rnn = 1
ORDER BY statusDate
Here you go. I'll let you clean it up and do optimizations. one of the sub queries can go into a view and the messy date comparison can be cleaned up. If you're using SQL 2008 R2 then use CAST as DATE instead.
-- #tbl1 is the source, #tbl2 the target. The final query lists source rows that pass
-- three filters (see the comments on each predicate below).
declare #tbl1 table(
id int, Trackingid int, Status varchar(50), StatusDate datetime
)
declare #tbl2 table(
id int, Trackingid int, Status varchar(50), StatusDate datetime
)
----Source data
insert into #tbl1 (id, trackingid, status, statusdate) values(1,1,'PickedUp','10/01/10 1:00') --
insert into #tbl1 (id, trackingid, status, statusdate) values(2,1,'InTransit','10/02/10 1:00') --
insert into #tbl1 (id, trackingid, status, statusdate) values(8,1,'InTransit','10/02/10 3:00')
insert into #tbl1 (id, trackingid, status, statusdate) values(4,1,'Delayed','10/03/10 1:00')
insert into #tbl1 (id, trackingid, status, statusdate) values(5,1,'InTransit','10/03/10 1:01')
insert into #tbl1 (id, trackingid, status, statusdate) values(6,1,'AtDest','10/03/10 2:00')
insert into #tbl1 (id, trackingid, status, statusdate) values(7,1,'Deliv','10/03/10 3:00') --
insert into #tbl1 (id, trackingid, status, statusdate) values(3,2,'InTransit','10/03/10 1:00')
insert into #tbl1 (id, trackingid, status, statusdate) values(9,2,'AtDest','10/04/10 1:00')
insert into #tbl1 (id, trackingid, status, statusdate) values(10,2,'Deliv','10/04/10 1:05')
insert into #tbl1 (id, trackingid, status, statusdate) values(11,1,'Delayed','10/02/10 2:05')
----Target data
insert into #tbl2 (id, trackingid, status, statusdate) values(1,1,'PickedUp','10/01/10 1:00')
insert into #tbl2 (id, trackingid, status, statusdate) values(2,1,'InTransit','10/02/10 1:00')
insert into #tbl2 (id, trackingid, status, statusdate) values(3,1,'Deliv','10/03/10 3:00')
-- d: source rows numbered per trackingid and calendar day (the year/month/day string
-- is cast back to DATETIME to drop the time part), ordered by statusdate
select d.* from
(
select
* ,
ROW_NUMBER() OVER(PARTITION BY trackingid, CAST((STR( YEAR( statusdate ) ) + '/' +STR( MONTH(statusdate ) ) + '/' +STR( DAY( statusdate ) )) AS DATETIME) ORDER BY statusdate) AS 'RN'
from #tbl1
) d
where
-- filter 1: drop a row when the next row (RN + 1) of the same trackingid and the same
-- day carries the same status
not exists
(
select RN from
(
select
* ,
ROW_NUMBER() OVER(PARTITION BY trackingid, CAST((STR( YEAR( statusdate ) ) + '/' +STR( MONTH(statusdate ) ) + '/' +STR( DAY( statusdate ) )) AS DATETIME) ORDER BY statusdate) AS 'RN'
from #tbl1
)f where f.RN = d.RN + 1 and d.status = f.status and f.trackingid = d.trackingid and
CAST((STR( YEAR( f.statusdate ) ) + '/' +STR( MONTH(f.statusdate ) ) + '/' +STR( DAY( f.statusdate ) )) AS DATETIME) =
CAST((STR( YEAR( d.statusdate ) ) + '/' +STR( MONTH(d.statusdate ) ) + '/' +STR( DAY( d.statusdate ) )) AS DATETIME)
)
and
-- filter 2: drop rows that already exist in the target with the same trackingid,
-- statusdate and status
not exists
(
select 1 from #tbl2 t2
where (t2.trackingid = d.trackingid
and t2.statusdate = d.statusdate
and t2.status = d.status)
)
-- filter 3: keep the row unless BOTH the latest target row for the trackingid AND the
-- latest target row at or before this row's statusdate carry the same status
and (
not exists
(
select 1 from
(
select top 1 * from #tbl2 t2
where t2.trackingid = d.trackingid
order by t2.statusdate desc
) g
where g.status = d.status
)
or not exists
(
select 1 from
(
select top 1 * from #tbl2 t2
where t2.trackingid = d.trackingid
and t2.statusdate <= d.statusdate
order by t2.statusdate desc
) g
where g.status = d.status
)
)
order by trackingid,statusdate
How well this performs will depend on indexes, and particularly on whether you are targeting a single TrackingID at a time, but this is one way to use a CTE and a self-join to obtain the desired results:
-- Sample status history for one shipment, held in a session-local temp table.
CREATE TABLE #foo
(
    TrackingID INT,
    [Status]   VARCHAR(32),
    StatusDate SMALLDATETIME
);

INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'PickedUp', '2010-10-01 08:15');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-02 03:07');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-02 10:28');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'Delayed', '2010-10-03 09:52');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'InTransit', '2010-10-03 20:09');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'AtDest', '2010-10-04 13:42');
INSERT INTO #foo (TrackingID, [Status], StatusDate) VALUES (1, 'Deliv', '2010-10-04 17:05');

-- Emit every row of TrackingID 1 that is either the last row or is followed by
-- a row with a different status, i.e. the last row of each run of equal statuses.
WITH ordered AS
(
    -- Number the shipment's rows by timestamp.
    SELECT
        f.TrackingID,
        f.[Status],
        f.StatusDate,
        ROW_NUMBER() OVER (ORDER BY f.StatusDate) AS seq
    FROM #foo AS f
    WHERE f.TrackingID = 1
),
paired AS
(
    -- Put each row next to its immediate successor (NULLs when there is none).
    SELECT
        cur.TrackingID,
        cur.seq        AS cur_seq,
        nxt.seq        AS next_seq,
        cur.[Status]   AS cur_status,
        cur.StatusDate AS cur_date,
        nxt.[Status]   AS next_status,
        nxt.StatusDate AS next_date
    FROM ordered AS cur
    LEFT JOIN ordered AS nxt
        ON nxt.seq - 1 = cur.seq
)
SELECT
    ROW_NUMBER() OVER (ORDER BY p.cur_date) AS Id,
    p.TrackingID,
    p.cur_status AS [Status],
    p.cur_date   AS [StatusDate]
FROM paired AS p
WHERE p.next_seq IS NULL
   OR (p.next_seq - 1 = p.cur_seq AND p.next_status <> p.cur_status)
ORDER BY
    [StatusDate];
GO
DROP TABLE #foo;
If you need to support multiple TrackingIDs in the same query:
-- Sample status history for two shipments, held in a session-local temp table.
CREATE TABLE #foo
(
TrackingID INT,
[Status] VARCHAR(32),
StatusDate SMALLDATETIME
);
INSERT #foo SELECT 1, 'PickedUp', '2010-10-01 08:15';
INSERT #foo SELECT 1, 'InTransit', '2010-10-02 03:07';
INSERT #foo SELECT 1, 'InTransit', '2010-10-02 10:28';
INSERT #foo SELECT 1, 'Delayed', '2010-10-03 09:52';
INSERT #foo SELECT 1, 'InTransit', '2010-10-03 20:09';
INSERT #foo SELECT 1, 'AtDest', '2010-10-04 13:42';
INSERT #foo SELECT 1, 'Deliv', '2010-10-04 17:05';
INSERT #foo SELECT 2, 'InTransit', '2010-10-02 10:28';
INSERT #foo SELECT 2, 'Delayed', '2010-10-03 09:52';
INSERT #foo SELECT 2, 'InTransit', '2010-10-03 20:09';
INSERT #foo SELECT 2, 'AtDest', '2010-10-04 13:42';

-- For every TrackingID, emit each row that is either the shipment's last row
-- or is followed by a row with a different status.
WITH src AS
(
    SELECT
        TrackingID,
        [Status],
        StatusDate,
        -- Number rows per shipment. A single global sequence would interleave
        -- the shipments, so a row's "next" number would often belong to another
        -- TrackingID, the join below would find no match, and the row would be
        -- emitted as if it were the last one.
        ab = ROW_NUMBER() OVER (PARTITION BY TrackingID ORDER BY [StatusDate])
    FROM #foo
),
realsrc AS
(
    -- Pair each row with its immediate successor within the same shipment;
    -- the right-hand columns are NULL for the shipment's final row.
    SELECT
        a.TrackingID,
        leftrow = a.ab,
        rightrow = b.ab,
        leftstatus = a.[Status],
        leftstatusdate = a.StatusDate,
        rightstatus = b.[Status]
    FROM src AS a
    LEFT OUTER JOIN src AS b
        ON a.ab = b.ab - 1
        AND a.TrackingID = b.TrackingID
)
SELECT
    Id = ROW_NUMBER() OVER (ORDER BY TrackingID, [leftstatusdate]),
    TrackingID,
    [Status] = leftstatus,
    [StatusDate] = leftstatusdate
FROM
    realsrc
WHERE
    rightrow IS NULL                -- last row of the shipment
    OR leftstatus <> rightstatus    -- status changes on the next row
ORDER BY
    TrackingID,
    [StatusDate];
GO
DROP TABLE #foo;
If this is SQL 2005 then you can use ROW_NUMBER with a sub query or CTE:
If the dataset is really huge and performance is an issue, though, one of the answers above (which were posted while I was still trying to get my code block to format) could well be more efficient.
/**
* This is just to create a sample table to use in the test query.
* Table variables must be named with a leading @ ("DECLARE #name TABLE" is not valid T-SQL).
**/
DECLARE @test TABLE(ID INT, TrackingID INT, Status VARCHAR(20), StatusDate DATETIME)
-- UNION ALL: the literal rows are already distinct, so no de-duplication pass is needed.
INSERT @test (ID, TrackingID, Status, StatusDate)
SELECT 1,1,'PickedUp', '01 jan 2010 08:00' UNION ALL
SELECT 2,1,'InTransit', '01 jan 2010 08:01' UNION ALL
SELECT 3,1,'InTransit', '01 jan 2010 08:02' UNION ALL
SELECT 4,1,'Delayed', '01 jan 2010 08:03' UNION ALL
SELECT 5,1,'InTransit', '01 jan 2010 08:04' UNION ALL
SELECT 6,1,'AtDest', '01 jan 2010 08:05' UNION ALL
SELECT 7,1,'Deliv', '01 jan 2010 08:06'
/**
* This would be the select code to exclude the duplicate entries.
* Sorting desc in row_number would get latest instead of first.
* Note: only the FIRST occurrence of each Status per TrackingID is kept, so a
* status that recurs later (e.g. the 'InTransit' row after 'Delayed' in the
* sample, ID 5) is excluded as well.
**/
;WITH n AS
(
    SELECT ID,
           TrackingID,
           Status,
           StatusDate,
           --For each Status for a tracking ID number by ID (could use date but 2 may be the same)
           ROW_NUMBER() OVER(PARTITION BY TrackingID, Status ORDER BY ID) AS [StatusNumber]
    FROM @test
)
SELECT ID,
       TrackingID,
       Status,
       StatusDate
FROM n
WHERE StatusNumber = 1
ORDER BY ID
I think this example will do what you're looking for:
-- Source feed of status events and the target table they are loaded into.
CREATE TABLE dbo.srcStatus (
Id INT IDENTITY(1,1),
TrackingId INT NOT NULL,
[Status] VARCHAR(10) NOT NULL,
StatusDate DATETIME NOT NULL
);
CREATE TABLE dbo.tgtStatus (
Id INT IDENTITY(1,1),
TrackingId INT NOT NULL,
[Status] VARCHAR(10) NOT NULL,
StatusDate DATETIME NOT NULL
);
-- Sample source rows for two shipments (the target starts empty).
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'PickedUp','10/1/2010 8:15 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'InTransit','10/2/2010 3:07 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'InTransit','10/2/2010 10:28 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'PickedUp','10/1/2010 8:15 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'InTransit','10/2/2010 3:07 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'Delayed','10/2/2010 10:28 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'Delayed','10/3/2010 9:52 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'InTransit','10/3/2010 8:09 PM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'AtDest','10/4/2010 1:42 PM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 1,'Deliv','10/4/2010 5:05 PM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'InTransit','10/3/2010 9:52 AM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'InTransit','10/3/2010 8:09 PM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'AtDest','10/4/2010 1:42 PM');
INSERT INTO dbo.srcStatus ( TrackingId, [Status], StatusDate ) VALUES ( 2,'Deliv','10/4/2010 5:05 PM');

-- Load into tgtStatus the first row of every run of consecutive equal statuses
-- (per TrackingId), skipping rows the target already holds.
WITH    cteSrcTrackingIds
          -- Shipments that appear in the source.
          AS ( SELECT DISTINCT
                        TrackingId
               FROM     dbo.srcStatus
             ),
        cteAllTrackingIds
          -- Source rows plus the existing target rows of those shipments.
          -- UNION (not UNION ALL) is deliberate: a row present on both sides
          -- must appear only once.
          AS ( SELECT   TrackingId ,
                        [Status] ,
                        StatusDate
               FROM     dbo.srcStatus
               UNION
               SELECT   tgtStatus.TrackingId ,
                        tgtStatus.[Status] ,
                        tgtStatus.StatusDate
               FROM     cteSrcTrackingIds
                        INNER JOIN dbo.tgtStatus ON cteSrcTrackingIds.TrackingId = tgtStatus.TrackingId
             ),
        cteAllTrackingIdsWithRownums
          -- Chronological position of each row within its shipment.
          AS ( SELECT   TrackingId ,
                        [Status] ,
                        StatusDate ,
                        ROW_NUMBER() OVER ( PARTITION BY TrackingId ORDER BY StatusDate ) AS rownum
               FROM     cteAllTrackingIds
             ),
        cteTrackingIdsWorkingSet
          -- Pair every row with all later rows of the same shipment that have
          -- a different status. Ordering the candidates by src2.rownum makes
          -- rownum = 1 the NEAREST such row; ordering by src.StatusDate (which
          -- is constant inside this partition) would leave the choice arbitrary.
          AS ( SELECT   src.rownum AS [id] ,
                        src2.rownum AS [id2] ,
                        src.TrackingId ,
                        src.[Status] ,
                        src.StatusDate ,
                        ROW_NUMBER() OVER ( PARTITION BY src.TrackingId,
                                            src.rownum ORDER BY src2.rownum ) AS rownum
               FROM     cteAllTrackingIdsWithRownums AS [src]
                        LEFT OUTER JOIN cteAllTrackingIdsWithRownums AS [src2] ON src.TrackingId = src2.TrackingId
                                                              AND src.rownum < src2.rownum
                                                              AND src.[Status] != src2.[Status]
             ),
        cteTrackingIdsSubset
          -- One row per source row, carrying id2 = position of the next status
          -- change (NULL when there is none). Rows sharing the same id2 form
          -- one run; rownum = 1 marks the first row of that run.
          AS ( SELECT   id ,
                        TrackingId ,
                        [Status] ,
                        StatusDate ,
                        ROW_NUMBER() OVER ( PARTITION BY TrackingId, id2 ORDER BY id ) AS rownum
               FROM     cteTrackingIdsWorkingSet
               WHERE    rownum = 1
             )
    INSERT  INTO dbo.tgtStatus
            ( TrackingId ,
              [status] ,
              StatusDate
            )
            SELECT  cteTrackingIdsSubset.TrackingId ,
                    cteTrackingIdsSubset.[status] ,
                    cteTrackingIdsSubset.StatusDate
            FROM    cteTrackingIdsSubset
                    -- Anti-join: keep only rows with no identical row in the target.
                    LEFT OUTER JOIN dbo.tgtStatus ON cteTrackingIdsSubset.TrackingId = tgtStatus.TrackingId
                                                     AND cteTrackingIdsSubset.[status] = tgtStatus.[status]
                                                     AND cteTrackingIdsSubset.StatusDate = tgtStatus.StatusDate
            WHERE   cteTrackingIdsSubset.rownum = 1
                    AND tgtStatus.id IS NULL
            ORDER BY cteTrackingIdsSubset.TrackingId ,
                    cteTrackingIdsSubset.StatusDate;