SQL Group By Problem - sql

I have a table that has 3 cols namely points, project_id and creation_date. every time points are assigned a new record has been made, for example.
points = 20 project_id = 441 creation_date = 04/02/2011 -> Is one record
points = 10 project_id = 600 creation_date = 04/02/2011 -> Is another record
points = 5 project_id = 441 creation_date = 06/02/2011 -> Is final record
(creation_date is the date on which record is entered and it is achieved by setting the default value to GETDATE())
now the problem is I want to get MAX points grouped by project_id but I also want creation_date to appear with it so I can use it for another purpose, if creation date is repeating its ok and I cannot group by creation_date because if I do so it will skip the points of project with id 600 and its wrong because id 600 is a different project and its only max points are 10 so it should be listed and its only possible if I do the grouping using project_id but then how should I also list creation_date
So far I am using this query to get MAX points of each project
-- Highest non-zero score per project for one writer, limited to the current
-- calendar month.
-- @writer_id is a T-SQL variable/parameter and therefore carries the @ prefix.
-- The month filter is a half-open range on creation_date: comparing month names
-- alone also matches the same month of every other year, and wrapping the
-- column in DATENAME() prevents an index seek on creation_date.
SELECT
    MAX(LCL.points) AS points,
    LCL.project_id
FROM LogiCpsLogs AS LCL
WHERE LCL.writer_id = @writer_id
    AND LCL.points <> 0
    AND LCL.creation_date >= DATEADD(month, DATEDIFF(month, 0, GETDATE()), 0)
    AND LCL.creation_date <  DATEADD(month, DATEDIFF(month, 0, GETDATE()) + 1, 0)
GROUP BY LCL.project_id
writer_id is the ID of writer whose points I want to see, like writer_id = 1, 2 or 3.
This query brings the result of current month only but I would like to list creation_date as well. Please help.

The subquery way
-- Pair every row with its project's best score and keep only the rows that
-- reach it. When several rows of one project share the top score, each of
-- them is returned.
SELECT
    P.Project_ID,
    P.Creation_Date,
    T.Max_Points
FROM (
    -- One row per project carrying that project's highest Points value.
    SELECT
        Project_ID,
        MAX(Points) AS Max_Points
    FROM Projects
    GROUP BY Project_ID
) AS T
INNER JOIN Projects AS P
    ON  T.Project_ID = P.Project_ID
    AND T.Max_Points = P.Points
Please see comment: this will give you ALL days where max-points was achieved. If you only just want one, the query will be more complex.
Edits:
Misread requirements. Added additional constraint.

I'll give you sample..
SELECT MAX(POINTS),
PROJECT_ID,
CREATION_DATE
FROM yourtable
GROUP by CREATION_DATE,PROJECT_ID;

This should be what you want; you don't even need a GROUP BY or aggregate functions:
-- For one writer, return each non-zero row for which no other row of the same
-- writer and project has a strictly higher score, i.e. the per-project maximum
-- together with its date. Rows tied on the maximum are all returned.
-- @T (table variable) and @writer_id are T-SQL variables, hence the @ prefix.
SELECT
    LCL.points,
    LCL.project_id,
    LCL.created_date
FROM @T AS LCL
WHERE LCL.writer_id = @writer_id
    AND LCL.points <> 0
    AND NOT EXISTS (
        -- EXISTS only tests for the presence of a row, so no TOP is needed.
        SELECT 1
        FROM @T AS T2
        WHERE T2.writer_id = @writer_id
            AND T2.project_id = LCL.project_id
            AND T2.points > LCL.points
    )
Where @T is your table. Also, if you want to show only the records that hold the overall maximum for a project, rather than the maximum for just the given @writer_id, remove the restriction T2.writer_id = @writer_id from the inner query.
And my code that I used to test:
-- Test fixture: a table variable standing in for the real table, plus the
-- writer to filter on. T-SQL variable names must start with @.
DECLARE @T TABLE
(
    writer_id    int,
    points       int,
    project_id   int,
    created_date datetime
)
-- Explicit column lists keep the inserts valid if the definition is reordered.
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (1, 20, 441, CAST('20110204' AS DATETIME))
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (1, 10, 600, CAST('20110204' AS DATETIME))
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (1, 5, 441, CAST('20110202' AS DATETIME))
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (1, 15, 241, GETDATE())
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (1, 12, 241, GETDATE())
INSERT INTO @T (writer_id, points, project_id, created_date) VALUES (2, 12, 241, GETDATE())
-- Show the raw fixture rows.
SELECT writer_id, points, project_id, created_date FROM @T
DECLARE @writer_id int = 1
My results:
Result Set (3 items)
points | project_id | created_date
20 | 441 | 04/02/2011 00:00:00
10 | 600 | 04/02/2011 00:00:00
15 | 241 | 21/09/2011 18:59:31

My solution use CROSS APPLY sub-queries.
For optimal performance I have created an index on project_id (ASC) & points (DESC sorting order) fields.
If you want to see all creation_date values that have maximum points then you can use WITH TIES:
CREATE TABLE dbo.Project
(
project_id INT PRIMARY KEY
,name NVARCHAR(100) NOT NULL
);
CREATE TABLE dbo.ProjectActivity
(
project_activity INT IDENTITY(1,1) PRIMARY KEY
,project_id INT NOT NULL REFERENCES dbo.Project(project_id)
,points INT NOT NULL
,creation_date DATE NOT NULL
);
CREATE INDEX IX_ProjectActivity_project_id_points_creation_date
ON dbo.ProjectActivity(project_id ASC, points DESC)
INCLUDE (creation_date);
GO
INSERT dbo.Project
VALUES (1, 'A'), (2, 'BB'), (3, 'CCC');
INSERT dbo.ProjectActivity (project_id, points, creation_date)
VALUES (1,100,'2011-01-01'), (1,110,'2011-02-02'), (1, 111, '2011-03-03'), (1, 111, '2011-04-04')
,(2, 20, '2011-02-02'), (2, 22, '2011-03-03')
,(3, 2, '2011-03-03');
-- For every project, fetch its highest-scoring activity row(s); WITH TIES
-- keeps all rows that share the top points value.
SELECT
    p.project_id,
    p.name,
    best.points,
    best.creation_date
FROM dbo.Project AS p
CROSS APPLY (
    SELECT TOP (1) WITH TIES
        pa.points,
        pa.creation_date
    FROM dbo.ProjectActivity AS pa
    WHERE p.project_id = pa.project_id
    ORDER BY pa.points DESC
) AS best;
DROP TABLE dbo.ProjectActivity;
DROP TABLE dbo.Project;

Related

join equal or max

I want to join two tables by 'EmployeeId' and 'CalendarMonthId', but the problem is that, when no row with an equal CalendarMonthId exists, I want to join to the record with the biggest CalendarMonthId. I use Azure Synapse SQL pool.
Example data and code
create table emp_contr (
EmployeeId int,
CalendarMonthId int,
IsDeleted int
--login varchar(20)
)
create table contr (
ContractId int,
EmployeeId int,
CalendarMonthId int,
value int
)
insert into emp_contr values (1, 202201, 0)
insert into emp_contr values (1, 202202, 0)
insert into emp_contr values (1, 202205, 0)
insert into emp_contr values (1, 202206, 0)
insert into emp_contr values (2, 202202, 0)
insert into emp_contr values (2, 202203, 0)
insert into contr values (1, 1, 202201, 5)
insert into contr values (2, 1, 202202, 2)
insert into contr values (40, 2, 202202, 2)
insert into contr values (50, 2, 202203, 0)
Base on this data I have problem with connect table: emp_contr, row: EmployeeId:1, CalendarMonthId:202205 with row from table contr from the same user and maximum CalendarMonthId:202202. I have query like this, but it doesn't work
select *
from emp_contr ec
join contr c
on ec.EmployeeId = c.EmployeeId
and (ec.CalendarMonthId = c.CalendarMonthId or ec.CalendarMonthId > max(c.CalendarMonthId))
order by ec.EmployeeId
I expect result like this:
EmployeeId
Emp_Contr_CalendarMonthId
Contr_CalendarMonthId
Value
ContractId
1
202206
202202
2
2
1
202205
202202
2
2
1
202202
202202
2
2
1
202201
202201
5
1
2
202203
202203
0
50
2
202202
202202
2
40
I don't know what I should do. Maybe should I add row in temporary table based on contr table with the same values like bigest one but different CalendarMonthId
TL;DR:
select ec.EmployeeId, ec.CalendarMonthId Emp_Contr_CalendarMonthId,
coalesce(c.CalendarMonthId,c2.CalendarMonthId) Contr_CalendarMonthId,
coalesce(c.value,c2.value) value,
coalesce(c.ContractId,c2.ContractId) ContractId,
-- additional columns for debugging
c.CalendarMonthId, c.value, c.ContractId,
c2.CalendarMonthId, c2.value, c2.ContractId
from emp_contr ec
left join contr c
on ec.EmployeeId = c.EmployeeId
and ec.CalendarMonthId = c.CalendarMonthId
join (
select *
from (
select EmployeeId,
CalendarMonthId,
value,
ContractId,
row_number() over (partition by EmployeeId order by CalendarMonthId desc) rn
from contr
) x
where rn = 1
) c2 on ec.EmployeeId = c2.EmployeeId
order by ec.EmployeeId, ec.CalendarMonthId desc
SQL Fiddle
The coalesce assumes all relevant columns are NOT NULL. If this is not the case you'll have to replace them with CASE-expressions in which you check if c.CalendarMonthId is null.
Explanation
The statement joins contr twice.
First using an outer join to obtain the matching results when present, without losing rows that have no match.
The second join is an inner join.
This assumes there is at least one contr row per emp_contr
The join does join an inline view which
uses the window function row_number() to filter only the last row per employee when ordered by CalendarMonthId
The last step is to combine the results, and pick the one from the first join if present and the one from the second join if not.

sql join using recursive cte

Edit: Added another case scenario in the notes and updated the sample attachment.
I am trying to write a sql to get an output attached with this question along with sample data.
There are two table, one with distinct ID's (pk) with their current flag.
another with Active ID (fk to the pk from the first table) and Inactive ID (fk to the pk from the first table)
Final output should return two columns, first column consist of all distinct ID's from the first table and second column should contain Active ID from the 2nd table.
Below is the sql:
IF OBJECT_ID('tempdb..#main') IS NOT NULL DROP TABLE #main;
IF OBJECT_ID('tempdb..#merges') IS NOT NULL DROP TABLE #merges
IF OBJECT_ID('tempdb..#final') IS NOT NULL DROP TABLE #final
SELECT DISTINCT id,
current
INTO #main
FROM tb_ID t1
--get list of all active_id and inactive_id
SELECT DISTINCT active_id,
inactive_id,
Update_dt
INTO #merges
FROM tb_merges
-- Combine where the id from the main table matched to the inactive_id (should return all the rows from #main)
SELECT id,
active_id AS merged_to_id
INTO #final
FROM (SELECT t1.*,
t2.active_id,
Update_dt ,
Row_number()
OVER (
partition BY id, active_id
ORDER BY Update_dt DESC) AS rn
FROM #main t1
LEFT JOIN #merges t2
ON t1.id = t2.inactive_id) t3
WHERE rn = 1
SELECT *
FROM #final
This sql partially works. It doesn't work, where the id was once active then gets inactive.
Please note:
the active ID should return the last most active ID
the ID which doesn't have any active ID should either be null or the ID itself
ID where the current = 0, in those cases active ID should be the ID current in tb_ID
ID's may get interchanged. For example there are two ID's 6 and 7, when 6 is active 7 is inactive and vice versa. the only way to know the most current active state is by the update date
Attached sample might be easy to understand
It looks like I might have to use a recursive CTE to achieve the results. Can someone please help?
thank you for your time!
I think you're correct that a recursive CTE looks like a good solution for this. I'm not entirely certain that I've understood exactly what you're asking for, particularly with regard to the update_dt column, just because the data is a little abstract as-is, but I've taken a stab at it, and it does seem to work with your sample data. The comments explain what's going on.
-- Table variables standing in for tb_ID and tb_merges. Variable names take the
-- @ prefix in T-SQL; #ValidMerge further down is a real temp table and keeps #.
declare @tb_id table (id bigint, [current] bit);
declare @tb_merges table (active_id bigint, inactive_id bigint, update_dt datetime2);
insert @tb_id (id, [current]) values
    -- Sample data from the question.
    (1, 1),
    (2, 1),
    (3, 1),
    (4, 1),
    (5, 0),
    -- A few additional data to illustrate a deeper search.
    (6, 1),
    (7, 1),
    (8, 1),
    (9, 1),
    (10, 1);
insert @tb_merges (active_id, inactive_id, update_dt) values
    -- Sample data from the question.
    (3, 1, '2017-01-11T13:09:00'),
    (1, 2, '2017-01-11T13:07:00'),
    (5, 4, '2013-12-31T14:37:00'),
    (4, 5, '2013-01-18T15:43:00'),
    -- A few additional data to illustrate a deeper search.
    (6, 7, getdate()),
    (7, 8, getdate()),
    (8, 9, getdate()),
    (9, 10, getdate());
if object_id('tempdb..#ValidMerge') is not null
    drop table #ValidMerge;
-- Get the subset of merge records whose active_id identifies a "current" id and
-- rank by date so we can consider only the latest merge record for each active_id.
with ValidMergeCTE as
(
    select
        M.active_id,
        M.inactive_id,
        [Priority] = row_number() over (partition by M.active_id order by M.update_dt desc)
    from
        @tb_merges M
        inner join @tb_id I on M.active_id = I.id
    where
        I.[current] = 1
)
select
    active_id,
    inactive_id
into
    #ValidMerge
from
    ValidMergeCTE
where
    [Priority] = 1;
-- Here's the recursive CTE, which draws on the subset of merges identified above.
with SearchCTE as
(
    -- Base case: any record whose active_id is not used as an inactive_id is an endpoint.
    select
        M.active_id,
        M.inactive_id,
        Depth = 0
    from
        #ValidMerge M
    where
        not exists (select 1 from #ValidMerge M2 where M.active_id = M2.inactive_id)
    -- Recursive case: look for records whose active_id matches the inactive_id of a previously
    -- identified record. The endpoint's active_id (S.active_id) is carried down the chain.
    union all
    select
        S.active_id,
        M.inactive_id,
        Depth = S.Depth + 1
    from
        #ValidMerge M
        inner join SearchCTE S on M.active_id = S.inactive_id
)
-- Every id from @tb_id, with the endpoint it was ultimately merged into
-- (NULL when the id never appears as an inactive_id in the chain).
select
    I.id,
    S.active_id
from
    @tb_id I
    left join SearchCTE S on I.id = S.inactive_id;
Results:
id active_id
------------------
1 3
2 3
3 NULL
4 NULL
5 4
6 NULL
7 6
8 6
9 6
10 6

Exclude rows where dates exist in another table

I have 2 tables, one is working pattern, another is absences.
1) Work pattern
ID | Shift Start | Shift End
123| 01-03-2017 | 02-03-2017
2) Absences
ID| Absence Start | Absence End
123| 01-03-2017 | 04-03-2017
What would be the best way, when selecting rows from work pattern, to exclude any that have a date marked as an absence in the absence table?
For example, I have a report that uses the work pattern table to count how may days a week an employee has worked, however I don't want it to include the days that have been marked as an absence on the absence table if that makes sense? Also don't want it to include any days that fall between the absence start and absence end date?
If the span of the absence should always encompass the shift to be excluded you can use not exists():
select *
from WorkPatterns w
where not exists (
select 1
from Absences a
where a.Id = w.Id
and a.AbsenceStart <= w.ShiftStart
and a.AbsenceEnd >= w.ShiftEnd
)
rextester demo: http://rextester.com/DCODC76816
returns:
+-----+------------+------------+
| id | ShiftStart | ShiftEnd |
+-----+------------+------------+
| 123 | 2017-02-27 | 2017-02-28 |
| 123 | 2017-03-05 | 2017-03-06 |
+-----+------------+------------+
given this test setup:
create table WorkPatterns ([id] int, [ShiftStart] datetime, [ShiftEnd] datetime) ;
insert into WorkPatterns ([id], [ShiftStart], [ShiftEnd]) values
(123, '20170227', '20170228')
,(123, '20170301', '20170302')
,(123, '20170303', '20170304')
,(123, '20170305', '20170306')
;
create table Absences ([id] int, [AbsenceStart] datetime, [AbsenceEnd] datetime) ;
insert into Absences ([id], [AbsenceStart], [AbsenceEnd]) values
(123, '20170301', '20170304');
What would be the best way, when selecting rows from work pattern
If you are dealing only with dates (no time) and have control over the DB schema,
one approach is to create a calendar table
in which you put all dates since the company started, plus some years into the future.
Fill that table once.
After that it is easy to join other tables with dates and do the math.
If you have trouble constructing the T-SQL query, please edit the question with more details about the columns and values of the tables, their relations, and the needed results.
How about this:
-- Shifts that are not swallowed by an absence of the same employee: a shift is
-- dropped only when both its start and its end fall inside one absence period.
-- NOT EXISTS replaces the joins to absences so that employees with no absence
-- rows keep their shifts, and so that a shift covered by one absence is not
-- let back in by comparison with a different absence of the same employee.
-- DISTINCT mirrors the de-duplication the former UNION performed.
SELECT DISTINCT
    WP.[id],
    WP.[shift_start],
    WP.[shift_end]
FROM work_pattern AS WP
WHERE NOT EXISTS (
    SELECT 1
    FROM absences AS A
    WHERE A.id = WP.id
        AND WP.[shift_start] BETWEEN A.[absence_start] AND A.[absence_end]
        AND WP.[shift_end]   BETWEEN A.[absence_start] AND A.[absence_end]
)
See it on SQL Fiddle: http://sqlfiddle.com/#!6/49ae6/6
Here is my example that includes a Date Dimension table. If your DBAs won't add it, you can create #dateDim as a temp table, like I've done with SQLFiddle (didn't know I could do that). A typical date dimension would have a lot more details you need about the days, but if the table can't be added, just use what you need. You'll have to populate the other Holidays you need. The DateDim I use often is at https://github.com/shawnoden/SQL_Stuff/blob/master/sql_CreateDateDimension.sql
SQL Fiddle
MS SQL Server 2014 Schema Setup:
/* Tables for your test data. */
CREATE TABLE WorkPatterns ( id int, ShiftStart date, ShiftEnd date ) ;
INSERT INTO WorkPatterns ( id, ShiftStart, ShiftEnd )
VALUES
(123, '20170101', '20171031')
, (124, '20170601', '20170831')
;
CREATE TABLE Absences ( id int, AbsenceStart date, AbsenceEnd date ) ;
INSERT INTO Absences ( id, AbsenceStart, AbsenceEnd )
VALUES
( 123, '20170123', '20170127' )
, ( 123, '20170710', '20170831' )
, ( 124, '20170801', '20170820' )
;
/* ******** MAKE SIMPLE CALENDAR TABLE ******** */
/* One row per calendar day of 2017 with weekend / holiday / work-day flags. */
CREATE TABLE dateDim (
theDate DATE NOT NULL
, IsWeekend BIT DEFAULT 0
, IsHoliday BIT DEFAULT 0
, IsWorkDay BIT DEFAULT 0
);
/* Populate basic details of dates. */
INSERT dateDim(theDate, IsWeekend, IsHoliday)
SELECT d
/* (@@DATEFIRST + DATEPART(dw, d)) % 7 is 0 for Saturday and 1 for Sunday
   whatever SET DATEFIRST is in effect; a bare DATEPART(dw, d) IN (1,7) is
   only right when the week starts on Sunday. */
, CONVERT(BIT, CASE WHEN (@@DATEFIRST + DATEPART(dw, d)) % 7 IN (0, 1) THEN 1 ELSE 0 END)
, CONVERT(BIT, CASE WHEN d = '20170704' THEN 1 ELSE 0 END) /* 4th of July. */
FROM (
SELECT d = DATEADD(DAY, rn - 1, '20170101')
FROM
(
/* DATEDIFF counts day boundaries (364 for this range), so + 1 is required
   for the last day, 20171231, to be generated as well. */
SELECT TOP (DATEDIFF(DAY, '20170101', '20171231') + 1)
rn = ROW_NUMBER() OVER (ORDER BY s1.[object_id])
FROM sys.all_objects AS s1
CROSS JOIN sys.all_objects AS s2
ORDER BY s1.[object_id]
) AS x
) AS y ;
/* If not a weekend or holiday, it's a WorkDay.
   Deliberately no WHERE clause: every row of the table is flagged. */
UPDATE dateDim
SET IsWorkDay = CASE WHEN IsWeekend = 0 AND IsHoliday = 0 THEN 1 ELSE 0 END
;
Query For Calculation:
SELECT wp.ID, COUNT(d.theDate) AS workDayCount
FROM WorkPatterns wp
INNER JOIN dateDim d ON d.theDate BETWEEN wp.ShiftStart AND wp.ShiftEnd
AND d.IsWorkDay = 1
LEFT OUTER JOIN Absences a ON d.theDate BETWEEN a.AbsenceStart AND a.AbsenceEnd
AND wp.ID = a.ID
WHERE a.ID IS NULL
GROUP BY wp.ID
ORDER BY wp.ID
Results:
| ID | workDayCount |
|-----|--------------|
| 123 | 172 | << 216 total days, 44 non-working
| 124 | 51 | << 65 total days, 14 non-working

How to check the overlapping time intervals from one type 2 SCD dimension

I have one problem identifying and fixing some records having overlapping time intervals, for one scd type 2 dimension.
What I have is:
Bkey Uid startDate endDate
'John' 1 1990-01-01 (some time stamp) 2017-01-10 (some time stamp)
'John' 2 2016-11-03 (some time stamp) 2016-11-14 (some time stamp)
'John' 3 2016-11-14 (some time stamp) 2016-12-29 (some time stamp)
'John' 4 2016-12-29 (some time stamp) 2017-01-10 (some time stamp)
'John' 5 2017-01-10 (some time stamp) 2017-04-22 (some time stamp)
......
I want to find (first) which are all the Johns having overlapping time periods, for a table having lots and lots of Johns, and then to figure out a way to correct those overlapping time periods. For the latter I know there are functions such as LAG and LEAD which can handle that, but it eludes me how to find those overlaps.
Any hints?
Regards,
[ 1 ] Following query will return overlapping time ranges:
-- For every row, attach as XML all other rows with the same Bkey whose date
-- range intersects it. Both comparisons are inclusive, so ranges that merely
-- touch (one ends on the day the other starts) count as overlapping.
-- @Dimension1 is a table variable, hence the @ prefix.
SELECT x.*,
    (
        SELECT y.*
        FROM @Dimension1 AS y
        WHERE y.Bkey = x.Bkey
            AND y.Uid <> x.Uid
            AND NOT (x.startDate > y.endDate OR x.endDate < y.startDate)
        FOR XML RAW, ROOT, TYPE
    ) AS OverlappingTimeRanges
FROM @Dimension1 AS x
Full script:
-- Self-contained demo: sample dimension rows in a table variable (T-SQL
-- variable names start with @), followed by the overlap query.
DECLARE @Dimension1 TABLE (
    Bkey VARCHAR(50) NOT NULL,
    Uid INT NOT NULL,
    startDate DATE NOT NULL,
    endDate DATE NOT NULL,
    CHECK(startDate < endDate)
);
INSERT @Dimension1 (Bkey, Uid, startDate, endDate)
SELECT 'John', 1, '1990-01-01', '2017-01-10' UNION ALL
SELECT 'John', 2, '2016-11-03', '2016-11-14' UNION ALL
SELECT 'John', 3, '2016-11-14', '2016-12-29' UNION ALL
SELECT 'John', 4, '2016-12-29', '2017-01-10' UNION ALL
SELECT 'John', 5, '2017-01-11', '2017-04-22';
-- For every row, attach as XML all other rows with the same Bkey whose date
-- range intersects it; the comparisons are inclusive, so touching ranges
-- count as overlapping.
SELECT x.*,
    (
        SELECT y.*
        FROM @Dimension1 AS y
        WHERE y.Bkey = x.Bkey
            AND y.Uid <> x.Uid
            AND NOT (x.startDate > y.endDate OR x.endDate < y.startDate)
        FOR XML RAW, ROOT, TYPE
    ) AS OverlappingTimeRanges
FROM @Dimension1 AS x
Demo here
[ 2 ] In order to find distinct groups of time ranges with overlapping original rows I would use following approach:
-- Edit 1
-- For every original row, store the widest date range spanned by the rows of
-- the same Bkey that intersect it (the row itself included). Rows that end up
-- with the same range are treated as one group by the queries below.
-- @Groups and @Dimension1 are table variables, hence the @ prefix.
DECLARE @Groups TABLE (
    Bkey VARCHAR(50) NOT NULL,
    Uid INT NOT NULL,
    startDateNew DATE NOT NULL,
    endDateNew DATE NOT NULL,
    CHECK(startDateNew < endDateNew)
);
INSERT @Groups (Bkey, Uid, startDateNew, endDateNew)
SELECT x.Bkey, x.Uid, z.startDateNew, z.endDateNew
FROM @Dimension1 x
OUTER APPLY (
    SELECT MIN(y.startDate) AS startDateNew, MAX(y.endDate) AS endDateNew
    FROM @Dimension1 y
    WHERE x.Bkey = y.Bkey
    AND NOT(x.startDate > y.endDate OR x.endDate < y.startDate)
) z;
-- End of Edit 1
-- This returns distinct groups identified by DistinctGroupId together with all overlapping Uid(s) from current group
SELECT *
FROM (
    SELECT ROW_NUMBER() OVER(ORDER BY b.Bkey, b.startDateNew, b.endDateNew) AS DistinctGroupId, b.*
    FROM (
        SELECT DISTINCT a.Bkey, a.startDateNew, a.endDateNew
        FROM @Groups a
    ) b
) c
OUTER APPLY (
    SELECT d.Uid AS Overlapping_Uid
    FROM @Groups d
    WHERE c.Bkey = d.Bkey
    AND c.startDateNew = d.startDateNew
    AND c.endDateNew = d.endDateNew
) e;
-- This returns distinct groups identified by DistinctGroupId together with an XML (XmlCol) which includes overlapping Uid(s)
SELECT *
FROM (
    SELECT ROW_NUMBER() OVER(ORDER BY b.Bkey, b.startDateNew, b.endDateNew) AS DistinctGroupId, b.*
    FROM (
        SELECT DISTINCT a.Bkey, a.startDateNew, a.endDateNew
        FROM @Groups a
    ) b
) c
OUTER APPLY (
    SELECT (
        SELECT d.Uid AS Overlapping_Uid
        FROM @Groups d
        WHERE c.Bkey = d.Bkey
        AND c.startDateNew = d.startDateNew
        AND c.endDateNew = d.endDateNew
        FOR XML RAW, TYPE
    ) AS XmlCol
) e
Note: Last range used in my example is 'John', 5, '2017-01-11', '2017-04-22'; and not 'John', 5, '2017-01-10', '2017-04-22';. Also, data type used is DATE and not DATETIME[2][OFFSET].
I think the tricky part of your query is being able to articulate the logic for overlapping ranges. We can self join on the condition that a row on the left overlaps with any row on the right. All matching rows are those which overlap.
We can think of four possible overlap scenarios:
|---------| |---------| no overlap
|---------|
|---------| 1st end and 2nd start overlap
|---------|
|---------| 1st start and 2nd end overlap
|---------|
|---| 2nd completely contained inside 1st
(could be 1st inside 2nd also)
-- Uids of rows whose date range intersects the range of a different row that
-- has the same Bkey. Both comparisons are inclusive, so ranges that only
-- touch are reported as overlapping.
SELECT DISTINCT
    t1.Uid
FROM yourTable AS t1
INNER JOIN yourTable AS t2
    ON  t1.Bkey = t2.Bkey
    AND t1.Uid <> t2.Uid              -- a row always intersects itself; leave it out
    AND t1.startDate <= t2.endDate
    AND t2.startDate <= t1.endDate
WHERE
    t1.Bkey = 'John'
This will at least let you identify overlapping records. Updating and separating them in a meaningful way will probably end up being an ugly gaps and islands problem, perhaps meriting another question.
We can achieve this by doing a self join of the emp table.
a.emp_id != b.emp_id ensures same row is not joined with itself.
remaining comparison clause checks if any row's start date or end date falls in other row's date range.
create table emp(name varchar(20), emp_id numeric(10), start_date date, end_date date);
insert into emp values('John', 1, '1990-01-01', '2017-01-10');
insert into emp values( 'John', 2, '2016-11-03', '2016-11-14');
insert into emp values( 'John', 3, '2016-11-14', '2016-12-29');
insert into emp values( 'John', 4, '2016-12-29', '2017-01-10');
insert into emp values( 'John', 5, '2017-01-11', '2017-04-22');
commit;
-- Rows of EMP whose period overlaps the period of another row with the same
-- name. Two periods overlap exactly when each one starts before the other one
-- ends; that single test also covers one period lying wholly inside the
-- other, which a "start or end falls between the other's bounds" check misses.
-- The comparisons are strict, so periods that merely touch are not reported.
-- DISTINCT returns each overlapping row once however many rows it overlaps.
select distinct
    a.name,
    a.emp_id,
    a.start_date,
    a.end_date
from emp a
inner join emp b
    on  a.name = b.name
    and a.emp_id != b.emp_id          -- never compare a row with itself
    and a.start_date < b.end_date
    and b.start_date < a.end_date;

Group events by temporal distance in SQL

In general, I need to associate (group) records which are created in similar time periods. If it helps, thinking of the example below as clickstream data where there is no sessionID and I need to build those sessions.
I have the following dataset:
UserId INT,
EventId INT,
DateCreated DATETIME,
BlockId INT
Assume the following data:
{123, 111, '2009-12-01 9:15am', NULL}
{123, 222, '2009-12-01 9:20am', NULL}
{123, 333, '2009-12-01 9:25am', NULL}
{123, 444, '2009-12-03 2:30pm', NULL}
{123, 555, '2009-12-03 2:32pm', NULL}
What I need to do is divide these events up, by user, into temporal buckets. There is a business rule that says anything > 30 minutes should be a new bucket. In the above example, events 111-333 represent a block, i.e. not more than 30 minutes separates them. Likewise, events 444-555 represent a second block.
My current solution uses a cursor and is extremely slow (therefore, unsustainable for the amount of data I need to process). I can post the code but it is pretty simple.
Any ideas?
Hopefully this will get you going in the right direction. If you're in an SP then using table variables for the StartTimes and EndTimes should make the query much easier to read and understand. This will give you start and end times for your batches, then just join back to your table and you should have it.
-- One (StartTime, EndTime) pair per activity block of a user, where a gap of
-- more than 30 minutes between a user's consecutive events separates blocks.
-- The anti-joins correlate on UserID only: also requiring equal EventID would
-- restrict the lookup to the event itself, so nothing would ever be excluded.
;WITH StartTimes AS
(
    -- Block starts: events with no earlier event of the same user within
    -- the preceding 30 minutes.
    SELECT DISTINCT
        T1.UserID,
        T1.DateCreated AS StartTime
    FROM
        My_Table T1
    LEFT OUTER JOIN My_Table T2 ON
        T2.UserID = T1.UserID AND
        T2.DateCreated >= DATEADD(mi, -30, T1.DateCreated) AND
        T2.DateCreated < T1.DateCreated
    WHERE
        T2.UserID IS NULL
)
SELECT
    ST.StartTime,
    ET.EndTime,
    ET.UserID
FROM
    (
        -- Block ends: events with no later event of the same user within
        -- the following 30 minutes.
        SELECT DISTINCT
            T3.UserID,
            T3.DateCreated AS EndTime
        FROM
            My_Table T3
        LEFT OUTER JOIN My_Table T4 ON
            T4.UserID = T3.UserID AND
            T4.DateCreated <= DATEADD(mi, 30, T3.DateCreated) AND
            T4.DateCreated > T3.DateCreated
        WHERE
            T4.UserID IS NULL
    ) AS ET
-- Candidate starts: the same user's block starts that are not after this end.
INNER JOIN StartTimes ST ON
    ST.UserID = ET.UserID AND
    ST.StartTime <= ET.EndTime
-- Anti-join: discard a candidate when a later start also precedes the end,
-- leaving the latest start for each end.
LEFT OUTER JOIN StartTimes ST2 ON
    ST2.UserID = ET.UserID AND
    ST2.StartTime <= ET.EndTime AND
    ST2.StartTime > ST.StartTime
WHERE
    ST2.StartTime IS NULL
Based on comment thread,
A. Buckets are defined by the first record in the bucket, and the first record in each Bucket is defined as any row where the DateCreated is more than 30 minutes after the latest earlier DateCreated. (immediately previous record)
B. The rest of the rows in the bucket are all rows with DateCreated on or after the First Row whose DateCreated is less than 30 minutes after the immediately previous row, and there does not exist a non-qualifying, (or new bucket-defining), row since the specified Bucket-defining row.
In English:
Select the DateCreated of those records where the DateCreated is more than 30 minutes after the previous DateCreated, and an aggregate function of your choice on all the other records in the table whose DateCreated is after that bucket-defining DateCreated, less than 30 minutes after its immediately previous DateCreated, and for which there are no records between the bucket-defining DateCreated and this one that follow a gap of more than 30 minutes.
In SQL:
-- One row per bucket: the DateCreated of the bucket's first record and the
-- number of records in the bucket.
-- A record opens a bucket when no earlier record lies within the 30 minutes
-- before it. This is phrased with NOT EXISTS so that the very first record,
-- which has no predecessor at all, opens a bucket too.
-- [Table] is a placeholder for the real table name, bracketed because TABLE
-- is a reserved word. Rows are not separated by user here.
Select Z.BucketDefinitionDate, Count(*) RowsInBucket
From (Select Distinct Ti.DateCreated BucketDefinitionDate
      From [Table] Ti
      Where Not Exists
            (Select * From [Table] P
             Where P.DateCreated < Ti.DateCreated
               And P.DateCreated >= DateAdd(minute, -30, Ti.DateCreated))) Z
   Join [Table] B
      -- >= so that the bucket-defining record is counted in its own bucket
      On B.DateCreated >= Z.BucketDefinitionDate
      -- B belongs to Z only if no other bucket opens after Z and up to B.
      -- The range excludes Z itself, which would otherwise always match.
      And Not Exists
          (Select * From [Table] X
           Where X.DateCreated > Z.BucketDefinitionDate
             And X.DateCreated <= B.DateCreated
             And Not Exists
                 (Select * From [Table] P
                  Where P.DateCreated < X.DateCreated
                    And P.DateCreated >= DateAdd(minute, -30, X.DateCreated)))
Group By Z.BucketDefinitionDate
What you can try is
-- Fixed-window alternative: label every event with its day, its hour and its
-- half-hour slot within the hour (0 = minutes 00-29, 1 = minutes 30-59).
-- @TABLE is a table variable, hence the @ prefix.
DECLARE @TABLE TABLE(
    ID INT,
    EventID INT,
    DateCreated DATETIME
)
-- Sample rows from the question; event 555 belongs to 2009-12-03.
INSERT INTO @TABLE (ID, EventID, DateCreated) SELECT 123, 111, '2009-12-01 9:15am'
INSERT INTO @TABLE (ID, EventID, DateCreated) SELECT 123, 222, '2009-12-01 9:20am'
INSERT INTO @TABLE (ID, EventID, DateCreated) SELECT 123, 333, '2009-12-01 9:25am'
INSERT INTO @TABLE (ID, EventID, DateCreated) SELECT 123, 444, '2009-12-03 2:30pm'
INSERT INTO @TABLE (ID, EventID, DateCreated) SELECT 123, 555, '2009-12-03 2:32pm'
SELECT ID,
    DATEADD(dd, DATEDIFF(dd,0,DateCreated), 0) DayVal,   -- date with the time part stripped
    DATEPART(hh, DateCreated) HourPart,
    FLOOR(DATEPART(mi, DateCreated) / 30.) MinBucket
FROM @TABLE
Now you can group by DayVal, HourPart and MinBucket.
I think I have something for you. it is not a cool single query like Tom H posted, but it seems to work. It uses a table variable as a working table.
-- Working table: one row per event. The identity id is assumed to follow
-- chronological order for a single user (rows are inserted oldest first).
-- @table is a table variable, hence the @ prefix.
declare @table table(
    id int identity(1,1),
    userId int,
    eventId int,
    dateCreated datetime,
    bucket int
)
-- Sample rows from the question; 'bucket' is set to 0 (= not assigned yet).
insert into @table (userId, eventId, dateCreated, bucket) select 123, 111, '2009-12-01 9:15am', 0
insert into @table (userId, eventId, dateCreated, bucket) select 123, 222, '2009-12-01 9:20am', 0
insert into @table (userId, eventId, dateCreated, bucket) select 123, 333, '2009-12-01 9:25am', 0
insert into @table (userId, eventId, dateCreated, bucket) select 123, 444, '2009-12-03 2:30pm', 0
insert into @table (userId, eventId, dateCreated, bucket) select 123, 555, '2009-12-03 2:32pm', 0
-- A row opens a new bucket when it is more than 30 minutes later than the row
-- with the preceding id. Each row's bucket is 1 plus the number of such
-- bucket-opening rows at or before it, so numbering is 1, 2, 3, ... in id
-- order and does not depend on the order in which the engine updates rows.
update [current]
set bucket = 1 + (
        select count(*)
        from @table as opener
        inner join @table as previous
            on previous.id = opener.id - 1
        where opener.id <= [current].id
          and opener.dateCreated > dateadd(mi, 30, previous.dateCreated)
    )
from @table as [current]
-- return the results
select * from @table