SQL Last activity of given type - sql

So I have a Visitor table, and a Visitor_activity table. Say:
Visitor
Visitor_ID Int
Visitor_name varchar(20)
Visitor_Activity
ID Int
Visitor_ID Int
Activity_Type char(3) -- values IN or OUT
Activity_Time datetime
Visitors might sign in and out multiple times in a day.
I'd like a nice query to tell me all visitors who are in: i.e. the last activity for today (on activity_time) was an "IN" not an "OUT". Any advice much appreciated.
It's T-SQL by the way, but I think it's more of an in-principle question.

One way to solve this is to use a correlated not exists predicate:
-- Every sign-in row that has no later sign-out row for the same visitor
-- on the same calendar day.
SELECT
    sign_in.Activity_Time,
    sign_in.Visitor_ID
FROM Visitor_Activity AS sign_in
WHERE sign_in.Activity_Type = 'IN'
  AND NOT EXISTS (
        SELECT 1
        FROM Visitor_Activity AS sign_out
        WHERE sign_out.Visitor_ID = sign_in.Visitor_ID
          AND sign_out.Activity_Type = 'OUT'
          AND sign_out.Activity_Time > sign_in.Activity_Time
          -- only a sign-out on the same date as the sign-in cancels it
          AND CAST(sign_out.Activity_Time AS date) = CAST(sign_in.Activity_Time AS date)
      )
This basically says: get every visitor_id that has a type = IN record for which there doesn't exist any type = OUT record with a later time (on the same date).
Sample SQL Fiddle

-- Visitors who are currently signed in: they have an 'IN' row dated today
-- that is not followed by any later 'OUT' row.
SELECT
    v.*
FROM
    Visitor v   -- the table in the question's schema is Visitor (singular)
    JOIN Visitor_Activity va ON va.Visitor_ID = v.Visitor_ID
WHERE
    va.Activity_Type = 'IN'
    -- the question asks about today's activity only; a range on the bare
    -- column keeps the predicate index-friendly
    AND va.Activity_Time >= CAST(GETDATE() AS date)
    AND NOT EXISTS ( SELECT
                        *
                     FROM
                        Visitor_Activity va_out
                     WHERE
                        va_out.Visitor_ID = va.Visitor_ID
                        AND va_out.Activity_Type = 'OUT'
                        AND va_out.Activity_Time > va.Activity_Time )

-- Latest sign-in and latest sign-out per visitor, looking at today's rows only.
with visitorsInOut as (
    select Visitor_id,
           -- literals match the schema's documented values ('IN' / 'OUT'),
           -- so the query also works under a case-sensitive collation
           max(case when Activity_Type = 'IN' then Activity_Time end) as inTime,
           max(case when Activity_Type = 'OUT' then Activity_Time end) as outTime
    from Visitor_Activity
    -- half-open range on the bare column instead of datediff(...) = 0:
    -- same rows, but an index on Activity_Time can be used
    where Activity_Time >= cast(getdate() as date)
      and Activity_Time < dateadd(day, 1, cast(getdate() as date))
    group by Visitor_id
)
select Visitor_id
from visitorsInOut
-- signed in when the last 'IN' is newer than the last 'OUT',
-- or when there has been no 'OUT' at all today
where inTime > outTime or outTime is null

This uses a CTE to find the activity record with the greatest Activity_Time where the Activity_Type = 'IN' and assigns it RowNum 1. Then you can INNER JOIN the CTE to the Visitor table, filtering by the CTE results where RowNum = 1.
-- Rank each visitor's activity for today, newest first. A visitor is "in"
-- only when the newest row (RowNum = 1) is an 'IN'.
; WITH VisAct AS (
    SELECT act.Visitor_ID
         , act.Activity_Type
         , ROW_NUMBER() OVER (PARTITION BY act.Visitor_ID ORDER BY act.Activity_Time DESC) AS RowNum
    FROM Visitor_Activity act
    -- Activity_Type is deliberately NOT filtered here: restricting to 'IN'
    -- before ranking would hide a later 'OUT' and report everyone who
    -- signed in at any point today.
    WHERE act.Activity_Time >= CAST(GETDATE() AS DATE)
)
SELECT vis.Visitor_ID, vis.Visitor_name
FROM Visitor vis
INNER JOIN VisAct act
    ON act.Visitor_ID = vis.Visitor_ID
WHERE act.RowNum = 1              -- must match the alias defined in the CTE
  AND act.Activity_Type = 'IN'    -- last activity of the day was a sign-in

You can pull the most recent action for each visitor, and then only return those where the last action for today was to check in.
-- Rank today's activity per visitor (newest first) and report the visitors
-- whose top-ranked row is an 'IN'.
;WITH todays_activity AS (
    SELECT
        act.Visitor_ID,
        act.Activity_Type,
        act.Activity_Time,
        RANK() OVER (PARTITION BY act.Visitor_ID ORDER BY act.Activity_Time DESC) AS LastAction
    FROM Visitor_Activity AS act
    -- restricts to today; drop this predicate to also see someone
    -- still checked in from yesterday
    WHERE DATEDIFF(d, 0, act.Activity_Time) = DATEDIFF(d, 0, getdate())
)
SELECT
    v.Visitor_ID,
    v.Visitor_Name,
    va.Activity_Type,
    va.Activity_Time
FROM Visitor AS v
INNER JOIN todays_activity AS va
    ON va.Visitor_ID = v.Visitor_ID
WHERE va.LastAction = 1
  AND va.Activity_Type = 'IN'

With CROSS APPLY:
-- Self-contained demo: visitors whose last activity on @d was an 'in'.
-- Local variables and table variables take the @ sigil in T-SQL
-- (# denotes a temp table and is not valid after DECLARE).
DECLARE @d DATE = '20150320'    -- the day being inspected

DECLARE @v TABLE
    (
      visitor_id INT ,
      visitor_name NVARCHAR(MAX)
    )

DECLARE @a TABLE
    (
      visitor_id INT ,
      type CHAR(3) ,
      time DATETIME
    )

INSERT INTO @v
VALUES ( 1, 'A' ),
       ( 2, 'B' ),
       ( 3, 'C' )

INSERT INTO @a
VALUES ( 1, 'in', '2015-03-20 19:32:27.513' ),
       ( 1, 'out', '2015-03-20 19:32:27.514' ),
       ( 1, 'in', '2015-03-20 19:32:27.515' ),
       ( 2, 'in', '2015-03-20 19:32:27.516' ),
       ( 2, 'out', '2015-03-20 19:32:27.517' ),
       ( 3, 'in', '2015-03-20 19:32:27.518' ),
       ( 3, 'out', '2015-03-20 19:32:27.519' ),
       ( 3, 'in', '2015-03-20 19:32:27.523' )

SELECT *
FROM @v v
     -- CROSS APPLY drops a visitor when the applied query returns no row
     CROSS APPLY ( SELECT *
                   FROM ( -- the visitor's newest activity within the day [@d, @d + 1)
                          SELECT TOP 1
                                 type
                          FROM @a a
                          WHERE a.visitor_id = v.visitor_id
                                AND a.time >= @d
                                AND a.time < DATEADD(dd, 1, @d)
                          ORDER BY time DESC
                        ) i
                   -- keep that newest row only when it is a sign-in
                   WHERE type = 'in'
                 ) c
Output:
visitor_id visitor_name type
1 A in
3 C in
The principle:
First you are selecting all visitors.
Then, for each visitor, you apply a subquery that returns that visitor's last activity of the day:
-- Excerpt of the applied subquery: the newest activity of visitor v on day @d.
-- (v is the outer row supplied by CROSS APPLY; @ is the T-SQL variable sigil.)
SELECT TOP 1
       type
FROM @a a
WHERE a.visitor_id = v.visitor_id
      AND a.time >= @d
      AND a.time < DATEADD(dd, 1, @d)
ORDER BY time DESC
Finally, you select from the result of the previous step with the filter type = 'in'. If the visitor's last activity was 'in', the applied query returns one row and the visitor is kept. If the last activity was 'out', the applied query returns an empty set, and by design CROSS APPLY eliminates that visitor from the result.

Related

SQL - Return count of consecutive days where value was unchanged

I have a table like
date
ticker
Action
'2022-03-01'
AAPL
BUY
'2022-03-02'
AAPL
SELL.
'2022-03-03'
AAPL
BUY.
'2022-03-01'
CMG
SELL.
'2022-03-02'
CMG
HOLD.
'2022-03-03'
CMG
HOLD.
'2022-03-01'
GPS
SELL.
'2022-03-02'
GPS
SELL.
'2022-03-03'
GPS
SELL.
I want to do a group by ticker then count all the times that Actions have sequentially been the value that they are as of the last date, here it's 2022-03-03. ie for this example table it'd be like;
ticker
NumSequentialDaysAction
AAPL
0
CMG
1
GPS
2
Fine to pass in 2022-03-03 as a value, don't need to figure that out on the fly.
Tried something like this
---Table Creation---
-- Sample table: one action per ticker per day.
CREATE TABLE UserTable
(
    [Date]   DATETIME2,
    [Ticker] varchar(5),
    [Action] varchar(5)
)
;

-- Three days of sample data for three tickers.
INSERT INTO UserTable ([Date], [Ticker], [Action])
VALUES
    ('2022-03-01', 'AAPL', 'BUY'),
    ('2022-03-02', 'AAPL', 'SELL'),
    ('2022-03-03', 'AAPL', 'BUY'),
    ('2022-03-01', 'CMG',  'SELL'),
    ('2022-03-02', 'CMG',  'HOLD'),
    ('2022-03-03', 'CMG',  'HOLD'),
    ('2022-03-01', 'GPS',  'SELL'),
    ('2022-03-02', 'GPS',  'SELL'),
    ('2022-03-03', 'GPS',  'SELL')
;
---Attempted Solution---
I'm thinking that I need a subquery to get the last value and then join the table to itself to get the matching values. Then I'd apply a window function, ordered by date, to check that the preceding value is sequential.
-- Attempt 1: numbers the rows of every (Ticker, Action) pair by Date, keeps
-- only the first row of each pair, then counts the distinct dates per ticker.
-- The result is therefore the number of distinct first-occurrence dates of a
-- ticker's actions, not the length of the most recent run.
WITH CTE AS (SELECT Date, Ticker, Action,
ROW_NUMBER() OVER (PARTITION BY Ticker, Action ORDER BY Date) as row_num
FROM UserTable)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE row_num = 1
GROUP BY Ticker;
-- Attempt 2: ranks each ticker's rows by (Action, Date) and keeps rank 1,
-- i.e. the row with the lowest-sorting Action and, within it, the earliest Date.
-- NOTE(review): `table` looks like a placeholder for UserTable - confirm.
WITH CTE AS (SELECT Date, Ticker, Action,
DENSE_RANK() OVER (PARTITION BY Ticker ORDER BY Action,Date) as rank
FROM table)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE rank = 1
GROUP BY Ticker;
You can do this with the help of the LEAD function like so. You didn't specify which RDBMS you're using. This solution works in PostgreSQL:
-- For each ticker, count how many consecutive earlier days carried the same
-- action as the most recent day (0 when the latest day's action is new).
-- Only the trailing run is counted: comparing every row with its successor and
-- summing the matches would also count repeats that happened earlier in the
-- history (e.g. BUY, BUY, SELL must yield 0, not 1).
WITH "withLatest" AS (
    SELECT
        ticker,
        date,
        "Action",
        -- the action on the ticker's most recent date
        FIRST_VALUE("Action") OVER (PARTITION BY ticker ORDER BY date DESC) AS "latestAction"
    FROM UserTable
),
"withBreaks" AS (
    SELECT
        ticker,
        -- walking from newest to oldest: number of rows seen so far whose action
        -- differs from the latest one; stays 0 while still inside the trailing run
        SUM(
            CASE
                WHEN "Action" IS DISTINCT FROM "latestAction" THEN 1
                ELSE 0
            END
        ) OVER (
            PARTITION BY ticker
            ORDER BY date DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS "breaksSoFar"
    FROM "withLatest"
)
SELECT
    ticker,
    -- the run always contains the latest day itself, which is not counted
    COUNT(*) - 1 AS "NumSequentialDaysAction"
FROM "withBreaks"
WHERE "breaksSoFar" = 0
GROUP BY ticker
Here is a way to do this using gaps and islands solution.
Thanks for sharing the create and insert scripts, which helps to build the solution quickly.
dbfiddle link.
https://dbfiddle.uk/rZLDTrNR
-- Gaps-and-islands: length of the run of identical actions that ends on the
-- as-of date, reported as the number of days BEFORE that date (0, 1, 2, ...).
with data
as (
    select date
          ,ticker
          ,action
          -- 1 marks the first row of a new run (action differs from the previous day)
          ,case when lag(action) over(partition by ticker order by date) <> action then
                1
           else 0
           end as marker
    from usertable
    -- ignore anything after the as-of date so later rows cannot extend the run
    where date <= '2022-03-03'
)
,interim_data
as (
    select *
          -- running total of markers = run identifier within the ticker
          ,sum(marker) over(partition by ticker order by date) as grp_val
    from data
)
,interim_data2
as (
    select *
          -- rows in the run minus the as-of day itself, matching the expected
          -- output (AAPL 0, CMG 1, GPS 2)
          ,count(*) over(partition by ticker,grp_val) - 1 as NumSequentialDaysAction
    from interim_data
)
select ticker,NumSequentialDaysAction
from interim_data2
where date='2022-03-03'
Another option, you could use the difference between two row_numbers approach as the following:
-- Difference-of-row_numbers approach: rows of one uninterrupted run of the
-- same action share the same grp value within (Ticker, Action).
-- Only the run that contains the latest row is counted. Grouping by every grp
-- of the last action would return one row per earlier run as well (for example
-- two rows for AAPL with BUY, SELL, BUY).
-- Requires FIRST_VALUE (SQL Server 2012+).
select [Ticker], count(*) - 1 as NumSequentialDaysAction
from
(
    select RN_Groups.*,
           -- action and run id of the ticker's most recent row
           first_value([Action]) over (partition by [Ticker] order by [Date] desc) as last_action,
           first_value(grp) over (partition by [Ticker] order by [Date] desc) as last_grp
    from
    (
        select *,
               row_number() over (partition by [Ticker] order by [Date]) -
               row_number() over (partition by [Ticker], [Action] order by [Date]) as grp
        from UserTable
        where [Date] <= '2022-03-03'
    ) RN_Groups
) Runs
/* keep only the rows of the run that ends on the latest date */
where [Action] = last_action
  and grp = last_grp
group by [Ticker]
See demo

Get the aggregated result of a GROUP BY for each value on WHERE clause in TSQL

I have a table in SQL Server with the following format
MType (Integer), MDate (Datetime), Status (SmallInt)
1, 10-05-2018, 1
1, 15-05-2018, 1
2, 25-3-2018, 0
3, 12-01-2018, 1
....
I want to get the MIN MDate for specific MTypes for future dates. In case there isn't one, then the MType should be returned but with NULL value.
Here is what I have done until now:
-- Earliest future MDate per requested MType, counting only rows with Status = 1.
-- An MType without any qualifying row produces no output row at all.
SELECT
    mt.MType,
    MIN(mt.MDate)
FROM MyTypes AS mt
WHERE mt.Status = 1
  AND mt.MDate > GETDATE()
  AND mt.MType IN (1, 2, 3, 4)
GROUP BY mt.MType
Obviously, the above will return only the following:
1, 10-05-2018
Since there aren't any other rows with a future date and a status equal to 1.
However, the results I want are:
1, 10-05-2018
2, NULL
3, NULL
4, NULL //this is missing in general from the table. No MType with value 4
The table is big, so performance is something to take into account. Any ideas how to proceed?
One way is to join the table to itself and filter the date in the ON clause.
-- One output row per requested MType that exists in MyTypes; the date is NULL
-- when that type has no future row with Status = 1.
SELECT
    requested.Mtype,
    MIN(upcoming.MDate)
FROM MyTypes AS requested
LEFT JOIN MyTypes AS upcoming
    -- the date/status filters live in ON so unmatched types are kept
    ON upcoming.MType = requested.MType
   AND upcoming.Status = 1
   AND upcoming.MDate > GETDATE()
WHERE requested.MType IN (1, 2, 3)
GROUP BY requested.MType
Here's a Demo.
I don't know the logic behind the requirement, but this looks like a case for a look-up table of the wanted MType values:
-- Drive the result from the list of wanted MTypes so that every one of them
-- appears, with a NULL date when no qualifying row exists.
SELECT
    a.MType,
    l.MDate
FROM (VALUES (1), (2), (3), (4)) AS a (MType)
LEFT JOIN (
        -- earliest future date per MType among rows with Status = 1
        SELECT
            src.MType,
            MIN(src.MDate) AS MDate
        FROM MyTypes AS src
        WHERE src.Status = 1
          AND src.MDate > GETDATE()
        GROUP BY src.MType
     ) AS l
    ON l.MType = a.MType
Use a window function and a union with a numbers table:
-- Demo with a table variable (table variables take the @ sigil in T-SQL).
declare @t table (MType int, MDate datetime, [Status] smallint)

Insert into @t values (1, convert(date, '10-05-2018', 103), 1)
,(1, convert(date, '15-05-2018', 103), 1)
,(2, convert(date, '25-03-2018', 103), 0)
,(3, convert(date, '12-01-2018', 103), 1)

-- Union the real rows with one placeholder row per number (1..10000) so that
-- every requested MType is present, then take the earliest qualifying date.
Select DISTINCT Mtype
     -- only future rows with status = 1 contribute; everything else becomes
     -- NULL, which min() ignores
     , min(iiF(MDate>getdate() and status = 1, MDate, NUll)) over (Partition By Mtype) as MDate
from ( -- placeholder rows: they never satisfy the iif condition (status = 0)
       SELECT TOP 10000 row_number() over(order by t1.number) as MType
            , '1900-01-01' as MDate, 0 as [Status]
       FROM master..spt_values t1
            CROSS JOIN master..spt_values t2
       union
       Select Mtype, MDate, [Status] from @t
     ) x
where MType in (1,2,3,4)
order by x.MType

SQL server to do like Group By task

I have a table with SQL server as below,
Date Value
---------------------------------------------------
08-01-2016 1
08-02-2016 1
08-03-2016 1
08-04-2016 1
08-05-2016 1
08-06-2016 2
08-07-2016 2
08-08-2016 2
08-09-2016 2.5
08-10-2016 1
08-11-2016 1
The original table is so large that even with 'Results to file' the query raises 'System.OutOfMemoryException'. That is why I want to collapse the table into a more compact form,
but I can't work out the logic for it. I want to turn the table into the form below.
Date_from Date_to Value
-------------------------------------------------
08-01-2016 08-05-2016 1
08-06-2016 08-08-2016 2
08-09-2016 08-09-2016 2.5
08-10-2016 08-11-2016 1
I appreciate your ideas!
This is commonly called the Gaps and Islands problem. Here is one trick to do this
-- Step 1 (data): pair every row with the previous row's value.
-- Step 2 (intr): running count of value changes = island number.
-- Step 3: first and last date of every island.
;WITH data AS (
    SELECT
        tc.Dates,
        tc.Value,
        LAG(tc.Value) OVER (ORDER BY tc.Dates) AS [pVal]
    FROM (VALUES ('08-01-2016', 1),
                 ('08-02-2016', 1),
                 ('08-03-2016', 1),
                 ('08-04-2016', 1),
                 ('08-05-2016', 1),
                 ('08-06-2016', 2),
                 ('08-07-2016', 2),
                 ('08-08-2016', 2),
                 ('08-09-2016', 2.5),
                 ('08-10-2016', 1),
                 ('08-11-2016', 1)) AS tc (Dates, Value)
),
intr AS (
    SELECT
        d.Dates,
        d.Value,
        -- counts 1 whenever the value is not equal to the previous one
        -- (including the first row, where pVal is NULL)
        SUM(CASE WHEN d.pVal = d.Value THEN 0 ELSE 1 END)
            OVER (ORDER BY d.Dates) AS [Counter]
    FROM data AS d
)
SELECT
    MIN(i.Dates) AS Dates_from,
    MAX(i.Dates) AS Dates_to,
    i.Value
FROM intr AS i
GROUP BY
    i.[Counter],
    i.Value
The cumulative sum/lag approach is one method. In this case, a simpler method is:
-- Number the rows of each value by date; subtracting that sequence from the
-- date gives the same anchor date for all rows of one consecutive run.
with numbered as (
    select t.*,
           row_number() over (partition by value order by date) as seq
    from t
)
select min(date) as date_from,
       max(date) as date_to,
       value
from numbered
group by value,
         dateadd(day, - seq, date);
This uses the observation that the dates are consecutive with no gaps. Hence, subtracting a sequence from the date will yield a constant -- when the values are the same.
Here is an example:
-- Demo with a table variable (table variables take the @ sigil in T-SQL).
DECLARE @T TABLE (
    [Date] DATE,
    [Value] DECIMAL(9,2)
)
INSERT @T VALUES
( '08-01-2016', 1 ),
( '08-02-2016', 1 ),
( '08-03-2016', 1 ),
( '08-04-2016', 1 ),
( '08-05-2016', 1 ),
( '08-06-2016', 2 ),
( '08-07-2016', 2 ),
( '08-08-2016', 2 ),
( '08-09-2016', 2.5 ),
( '08-10-2016', 1 ),
( '08-11-2016', 1 )

SELECT * FROM @T

-- Pair the k-th run start with the k-th run end.
SELECT A.[Date] StartDate, B.[Date] EndDate, A.[Value] FROM (
    -- run starts: rows with no same-valued row on the previous day
    SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM @T A
    LEFT JOIN @T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, -1, A.[Date])
    WHERE B.[Date] IS NULL
) A
JOIN (
    -- run ends: rows with no same-valued row on the next day
    SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM @T A
    LEFT JOIN @T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, 1, A.[Date])
    WHERE B.[Date] IS NULL
) B ON B.O = A.O
Prdp's solution is great, but in case anyone is still using SQL Server 2008, where LAG() (and the Parallel Data Warehouse (PDW) features) are not available, here is an alternative:
SAMPLE DATA:
-- Recreate the temp table holding the sample data.
IF OBJECT_ID('tempdb..#Temp') IS NOT NULL
    DROP TABLE #Temp;

CREATE TABLE #Temp
(
    [Dates] DATE,
    [Value] FLOAT
);

INSERT INTO #Temp ([Dates], [Value])
VALUES
    ('08-01-2016', 1),
    ('08-02-2016', 1),
    ('08-03-2016', 1),
    ('08-04-2016', 1),
    ('08-05-2016', 1),
    ('08-06-2016', 2),
    ('08-07-2016', 2),
    ('08-08-2016', 2),
    ('08-09-2016', 2.5),
    ('08-10-2016', 1),
    ('08-11-2016', 1);
QUERY:
-- Seq: global row sequence by (Dates, Value).
-- Runs: the global sequence minus the per-value sequence is constant within
-- one consecutive run of the same value, so it serves as the run key (rn).
;WITH Seq AS (
    SELECT
        ROW_NUMBER() OVER (ORDER BY src.[Dates], src.[Value]) AS SeqNo,
        src.Dates,
        src.[Value]
    FROM #Temp AS src
),
Runs AS (
    SELECT
        s.[Value],
        s.[Dates],
        s.SeqNo,
        s.SeqNo - ROW_NUMBER() OVER (PARTITION BY s.[Value] ORDER BY s.SeqNo) AS rn
    FROM Seq AS s
)
SELECT
    MIN(r.[Dates]) AS StartDate,
    MAX(r.[Dates]) AS EndDate,
    r.[Value]
FROM Runs AS r
GROUP BY
    r.[Value],
    r.rn
ORDER BY StartDate;
RESULTS:

Group records only if it have intersected periods

I have table like this
-- Sample periods held in a table variable (table variables take the @ sigil
-- in T-SQL; # would denote a temp table and is not valid after DECLARE).
declare @data table
(
    id int not null,
    groupid int not null,
    startDate datetime not null,
    endDate datetime not null
)
insert into @data values
(1, 1, '20150101', '20150131'),
(2, 1, '20150114', '20150131'),
(3, 1, '20150201', '20150228');
and my current selecting statement is:
-- Current query: one row per groupid regardless of whether the periods overlap.
-- (@data is the table variable declared for the sample rows.)
select groupid, 'some data', min(id), count(*)
from @data
group by groupid
But now I need to group records only if they have intersecting periods
desired result:
1, 'some data', 1, 2
1, 'some data', 3, 1
Does someone know how to do this?
One method is to identify the beginning of each group -- because it doesn't overlap with the previous one. Then, count the number of these as a group identifier.
-- overlaps: ids of the rows that START a new island, i.e. whose startDate does
--           not fall inside the period of any OTHER row of the same groupid.
-- groups:   running count of island starts (ordered by startDate) = island number.
with overlaps as (
      select d.id
      from @data d
      where not exists (select 1
                        from @data d2
                        where d.groupid = d2.groupid and
                              -- a row must not be compared with itself: its own
                              -- startDate always lies inside its own period, which
                              -- would leave no island starts at all
                              d2.id <> d.id and
                              d.startDate >= d2.startDate and
                              d.startDate < d2.endDate
                       )
     ),
     groups as (
      select d.*,
             -- count(o.id) only counts island-start rows; non-starts have o.id NULL
             count(o.id) over (partition by d.groupid
                               order by d.startDate) as grpnum
      from @data d left join
           overlaps o
           on d.id = o.id
     )
select groupid, min(id), count(*),
       min(startDate) as startDate, max(endDate) as endDate
from groups
group by grpnum, groupid;
Notes: This is using cumulative counts, which are available in SQL Server 2012+. You can do something similar with a correlated subquery or apply in earlier versions.
Also, this query assumes that the start dates are unique. If they are not, the query can be tweaked, but the logic becomes a bit more complicated.

Only select records that do not start within the time frame of another record

I'm trying to achieve the following goal using MS SQL Server 2005 but do not know how to do it.
The goal is to select only records that do not start within the same time period as an anchor record.
Rows that have same ID are a group and evaluated as part of that group.
Start with the earliest date (A) based on StartDate, compare to the next row (B) that has the same ID.
If B starts within A, mark B as invalid. Continue to compare A against all remaining records that have the same ID. Mark any starting within A as invalid.
Flag the next record that does not overlap with A as Valid. Now repeat the same process as above (i.e. check to see if any subsequent records start within the time frame of the new valid record).
Repeat this process until all records have been analyzed.
Example: Create the following table.
-- Recreate the temp table with the sample periods (all for ID 1).
IF OBJECT_ID('tempdb..#Dates') IS NOT NULL
    DROP TABLE #Dates

CREATE TABLE #Dates
(
    ID        int,
    StartDate datetime,
    EndDate   datetime
)

INSERT INTO #Dates (ID, StartDate, EndDate)
SELECT 1, '7/23/2003',  '8/22/2003'  UNION ALL
SELECT 1, '8/21/2003',  '11/19/2003' UNION ALL
SELECT 1, '11/18/2003', '12/18/2003' UNION ALL
SELECT 1, '12/17/2003', '1/16/2004'  UNION ALL
SELECT 1, '1/15/2004',  '2/14/2004'  UNION ALL
SELECT 1, '2/11/2004',  '2/26/2004'  UNION ALL
SELECT 1, '9/14/2004',  '10/14/2004' UNION ALL
SELECT 1, '10/5/2004',  '10/20/2004' UNION ALL
SELECT 1, '11/20/2004', '12/20/2004' UNION ALL
SELECT 1, '12/19/2004', '1/18/2005'  UNION ALL
SELECT 1, '1/12/2005',  '1/27/2005'  UNION ALL
SELECT 1, '2/27/2005',  '3/11/2005'
Expected output after applying the overlap logic rules:
ID StartDate EndDate Valid
-- --------- --------- -----
1 7/23/2003 8/22/2003 1
1 8/21/2003 11/19/2003 0
1 11/18/2003 12/18/2003 1
1 12/17/2003 1/16/2004 0
1 1/15/2004 2/14/2004 1
1 2/11/2004 2/26/2004 0
1 9/14/2004 10/14/2004 1
1 10/5/2004 10/20/2004 0
1 11/20/2004 12/20/2004 1
1 12/19/2004 1/18/2005 0
1 1/12/2005 1/27/2005 1
1 2/27/2005 3/11/2005 1
I figured out how to answer my own question. Used recursive SQL after ordering the records using row_number.
-- Recreate the temp table with the sample periods; the rows are inserted out
-- of date order on purpose, ordering is applied in the next phase.
IF OBJECT_ID('tempdb..#Dates') IS NOT NULL
    DROP TABLE #Dates

CREATE TABLE #Dates
(
    ID        int,
    StartDate datetime,
    EndDate   datetime
)

INSERT INTO #Dates (ID, StartDate, EndDate)
SELECT 1, '7/23/2003',  '8/22/2003'  UNION ALL
SELECT 1, '8/21/2003',  '11/19/2003' UNION ALL
SELECT 1, '11/18/2003', '12/18/2003' UNION ALL
SELECT 1, '12/19/2004', '1/18/2005'  UNION ALL
SELECT 1, '1/12/2005',  '1/27/2005'  UNION ALL
SELECT 1, '2/27/2005',  '3/11/2005'  UNION ALL
SELECT 1, '12/17/2003', '1/16/2004'  UNION ALL
SELECT 1, '1/15/2004',  '2/14/2004'  UNION ALL
SELECT 1, '2/11/2004',  '2/26/2004'  UNION ALL
SELECT 1, '9/14/2004',  '10/14/2004' UNION ALL
SELECT 1, '10/5/2004',  '10/20/2004' UNION ALL
SELECT 1, '11/20/2004', '12/20/2004'
--Phase 1: Apply ordering to dates
IF OBJECT_ID('tempdb..#OrderedRecords') IS NOT NULL
    DROP TABLE #OrderedRecords

-- Number the rows of each ID: earliest StartDate first and, for equal start
-- dates, the longest period (latest EndDate) first.
SELECT
    d.*,
    ROW_NUMBER() OVER (PARTITION BY d.ID ORDER BY d.StartDate ASC, d.EndDate DESC) AS N
INTO #OrderedRecords
FROM #Dates AS d
--Phase 2: Apply Overlap Rules (Subsume records that overlap)
-- Recursive CTE that walks the ordered rows of each ID one step at a time
-- (N = 1, 2, 3, ...) while carrying an "intermediate" window
-- (IntermediateStartDate .. IntermediateEndDate), the period of the most
-- recent row flagged Valid = 1.
-- For each next row c:
--   * c.StartDate inside the carried window (BETWEEN, so both ends inclusive)
--       -> the window is carried forward unchanged and c gets Valid = 0;
--   * otherwise
--       -> the window is reset to c's own period and c gets Valid = 1.
;with Subsume (ID, N, StartDate, EndDate, IntermediateStartDate, IntermediateEndDate, Valid) as
(
-- Anchor: the first row of every ID is always valid and opens the window.
select ID, N, StartDate, EndDate, IntermediateStartDate = StartDate, IntermediateEndDate = EndDate,
Valid = 1
from #OrderedRecords
where N = 1
UNION ALL
-- Recursive member 1: next row c whose EndDate is on or before the carried
-- window's start (y.IntermediateStartDate >= c.EndDate).
-- NOTE(review): this branch outputs the previous step's StartDate/EndDate
-- (y.*) together with c's N - confirm that is intended.
select c.ID, c.N, y.StartDate, y.EndDate,
IntermediateStartDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateStartDate else c.StartDate end,
IntermediateEndDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateEndDate else c.EndDate end,
Valid = case when (c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate) then 0 else 1 end
from #OrderedRecords c
join Subsume y
on y.ID = c.ID
and y.N = c.n - 1
and y.IntermediateStartDate >= c.EndDate
UNION ALL
-- Recursive member 2: next row c that ends after the carried window's start
-- (y.IntermediateStartDate < c.EndDate); outputs c's own dates.
select c.ID, c.N, c.StartDate, c.EndDate,
IntermediateStartDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateStartDate else c.StartDate end,
IntermediateEndDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateEndDate else c.EndDate end,
Valid = case when (c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate) then 0 else 1 end
from #OrderedRecords c
join Subsume y
on y.ID = c.ID
and y.N = c.n - 1
and y.IntermediateStartDate < c.EndDate
)
Select ID, StartDate, EndDate, Valid
from Subsume
-- MAXRECURSION 0 removes the recursion-depth limit, so an ID may have any
-- number of rows
OPTION (MAXRECURSION 0)