TSQL - GROUP BY on continous rows only

TSQL - GROUP BY on continous rows only - sql

I am working on a SQL Server 2017 (v14.0).
I have a table like this:
Key | State | from | until |
----+----------+------------+------------+
100 | open | 01.01.2021 | 01.01.2021 |
100 | open | 02.01.2021 | 02.01.2021 |
100 | closed | 03.01.2021 | 13.01.2021 |
100 | open | 14.01.2021 | 20.01.2021 |
100 | open | 20.01.2021 | 30.01.2021 |
I want to group it by Key and State, but only for continuous rows.
So my expected result would be something like:
Key | State | from | until |
----+----------+------------+------------+
100 | open | 01.01.2021 | 02.01.2021 |
100 | closed | 03.01.2021 | 13.01.2021 |
100 | open | 14.01.2021 | 30.01.2021 |
Any idea on how to do this? I have the strong feeling, that this should be possible with the help of ROW_NUMBER somehow, but I was not able to figure it out yet...
(In this example data some weird group by calendarweek or something similar might be possible, but this is not my intention)

It is a Gaps and Islands problem. One solution is this:
WITH cte1 AS (
SELECT *, CASE WHEN LAG([state]) OVER (PARTITION BY [key] ORDER BY [from]) = [state] THEN 0 ELSE 1 END AS chg
FROM t
), cte2 AS (
SELECT *, SUM(chg) OVER (PARTITION BY [key] ORDER BY [from]) AS grp
FROM cte1
)
SELECT [key], grp, MIN([state]), MIN([from]), MAX([until])
FROM cte2
GROUP BY [key], grp
ORDER BY [key], grp

One possibility is as below added:
Create table myTable_o1
(
[key] int
,[state] varchar(100)
,[From] date
,[Until] date
)
insert into myTable_o1 values (100, 'open', '2021-01-01','2021-01-01')
insert into myTable_o1 values (100, 'open', '2021-01-02','2021-01-02')
insert into myTable_o1 values (100, 'closed', '2021-01-03','2021-01-13')
insert into myTable_o1 values (100, 'open', '2021-01-4','2021-01-20')
insert into myTable_o1 values (100, 'open', '2021-01-20','2021-01-30')
SELECT
[key]
,[state]
,[From]
,[until]
FROM
(
Select
[key]
, [state]
, [From]
, row_number() over (partition by tiles order by [key]) row_num
, ISNULL(Lead(Until) over (partition by tiles order by [key]) , Until) [until]
FROM
(
SELECT * ,
Ntile(2) over ( order by [Key]) as [tiles]
from myTable_o1
) AS A
) AS B WHERE B.row_num in (1,3)

Related

SQL return second max date for each id, date and channel

I have the following table:
id channel_id date
1 | 1 | 2017-01-10
1 | 2 | 2018-02-05
1 | 1 | 2019-03-07
1 | 2 | 2020-03-15
2 | 1 | 2018-01-17
2 | 1 | 2019-07-20
2 | 1 | 2020-01-10
I want to return for previous maximum date for each date and id but two separate columns for both channel_id. So, one column for previous max date for channel_id is equal to 1 and another for previous max date for channel_id is equal to 2. What I want to get can be found below:
id channel_id date prev_date_channel_id1 prev_date_channel_id2
1 | 1 | 2017-01-10 | NULL | NULL |
1 | 2 | 2018-02-05 | 2017-01-10 | NULL |
1 | 1 | 2019-03-07 | 2017-01-10 | 2018-02-05 |
1 | 2 | 2020-03-15 | 2019-03-07 | 2018-02-05 |
2 | 1 | 2018-01-17 | NULL | NULL |
2 | 1 | 2019-07-20 | 2018-01-17 | NULL |
2 | 1 | 2020-01-10 | 2019-07-20 | NULL |
I made a query as below and returns what I want but takes too much time. I'd appreciate any optimization suggestions!
SELECT
a.id,
a.date,
MAX(c.date) AS prev_date_channel_id1,
MAX(d.date) AS prev_date_channel_id2
FROM
table a
LEFT JOIN
table c ON a.id=c.id AND a.date>c.date AND c.channel_id=1
LEFT JOIN
table d ON a.id=d.id AND a.date>d.date AND d.channel_id=2
GROUP BY a.id, a.date

Use lag() for the previous date and a cumulative conditional max for the channel 2 date:
select t.*, lag(date) over (partition by id order by date) as prev_date,
max(case when channel = 2 then date end) over
(partition by id
order by date
rows between unbounded preceding and 1 row preceding
) as prev_date_channel2
from t;

I think there's an error in your "expected output" for the value of prev_date_channel_id1 on the last row (it should be 2019-07-20).
In any case, with appropriate indexing an outer apply top 1 construct might serve you better:
create table t
(
id int,
channel_id int,
[date] date
constraint pk_t primary key clustered (id, channel_id, [date])
);
insert t values
(1, 1, '2017-01-10'),
(1, 2, '2018-02-05'),
(1, 1, '2019-03-07'),
(1, 2, '2020-03-15'),
(2, 1, '2018-01-17'),
(2, 1, '2019-07-20'),
(2, 1, '2020-01-10');
select t1.id,
t1.channel_id,
t1.[date],
prev_date_channel_id1 = c1.dt,
prev_date_channel_id2 = c2.dt
from t t1
outer apply (
select top 1 [date]
from t
where id = t1.id
and channel_id = 1
and [date] < t1.[date]
order by date desc
) c1(dt)
outer apply (
select top 1 [date]
from t
where id = t1.id
and channel_id = 2
and [date] < t1.[date]
order by date desc
) c2(dt)
order by t1.id, t1.[date];
Or possibly faster still, especially with the key changed to constraint pk_t primary key clustered (id, [date], [channel_id]))
select t1.id,
t1.channel_id,
t1.[date],
prev_date_channel_id1 = prev.c1,
prev_date_channel_id2 = prev.c2
from t t1
outer apply (
select c1 = max(iif(channel_id = 1, [date], null)),
c2 = max(iif(channel_id = 2, [date], null))
from t
where id = t1.id
and [date] < t1.[date]
) prev

Assuming you have an index on those three columns, you can use subqueries:
SELECT [T0].[id],
[T0].[channel_id],
[T0].[date],
[prev_date_channel_id1] = (
SELECT MAX([T1].[date])
FROM [t] [T1]
WHERE [T1].[id] = [T0].[id]
AND [T1].[date] < [T0].[date]
AND [T1].[channel_id] = 1
),
[prev_date_channel_id2] = (
SELECT MAX([T1].[date])
FROM [t] [T1]
WHERE [T1].[id] = [T0].[id]
AND [T1].[date] < [T0].[date]
AND [T1].[channel_id] = 2
)
FROM [t] [T0];

SQL - Choose which duplicate value to select

I have a table with a datetime, value and user.
This table has multiple rows for the same datetime but with a different user and value.
I want select distinct datetime with the corresponding value and user.
Where there is a duplicate datetime with different users, the value that user2 has input should be prioritised.
Table 1
-----------------
DateTime| Value| User
--------|---------|---------
1/1/17 | 10| User1
2/1/17 | 30| User1
3/1/17 | 10| User1
1/1/17 | 90| User2
2/1/17 | 80| User2
So from the above, I would end up with
1/1/17 | 90| User2
2/1/17 | 80| User2
3/1/17 | 10| User1
I'm sure there is a simple answer to this but I can't for the life of me work out how to do it!
Any help greatly appreciated.
Thanks

Not quite simple! Using window functions and common table expressions
; with x as (
select [DateTime], value, [User], row_num = row_number() over(partition by [DateTime] order by [User] desc) from Table1
)
select x.* from x where row_num = 1

This will always prioritize the input from 'User2', even when there is input from 'User1' and 'User3'.
;with cte as (
select *
, rn = row_number() over (
partition by [DateTime]
order by (case when [user] = 'User2' then 0 else 1 end) asc
)
from t
)
select *
from cte
where rn=1
rextester http://rextester.com/AZVA85684
results:
+----------+-------+-------+----+
| DateTime | value | user | rn |
+----------+-------+-------+----+
| 1/1/17 | 90 | User2 | 1 |
| 2/1/17 | 80 | User2 | 1 |
| 3/1/17 | 10 | User1 | 1 |
+----------+-------+-------+----+

DECLARE #T as table
(
[DateTime] nvarchar(100),
VALUE INT,
[USER] VARCHAR(32)
)
INSERT INTO #T
VALUES
('1/1/17', 10, 'User1'),
('2/1/17', 30, 'User1'),
('3/1/17', 10, 'User1'),
('1/1/17', 90, 'User2'),
('2/1/17', 80, 'User2')
SELECT t.[DateTime], t.VALUE, t.[USER]
FROM #T t
JOIN (
SELECT [DateTime], MAX([USER]) AS [USER]
FROM #T
GROUP BY [DateTime]
) u ON u.[DateTime] = t.[DateTime] AND u.[USER] = t.[USER]
ORDER BY VALUE DESC

;with cte
as
(
select
*,
row_number() over (partition by date order by replace(user,'user','') desc) as rownum
from
#temp
)
select * from cte where rownum=1

Select invoices based on quantity needed

I have a table that looks like this:
+---------------+---------------+------------------+--------------+
| InvoiceNumber | ProductNumber | ReceivedQuantity | ReceivedDate |
+---------------+---------------+------------------+--------------+
| INV001 | P001 | 500 | 09/01/2015 |
| INV002 | P001 | 600 | 09/02/2015 |
| INV003 | P001 | 700 | 09/03/2015 |
+---------------+---------------+------------------+--------------+
When a product is ordered. System needs to know which invoice it gets it from. First in first out.
For example I need 1000 quantity of product number P001. It should select the following invoices. It does not display the last invoice since 500 + 600 is already sufficient quantity
+---------------+---------------+------------------+--------------+
| InvoiceNumber | ProductNumber | ReceivedQuantity | ReceivedDate |
+---------------+---------------+------------------+--------------+
| INV001 | P001 | 500 | 09/01/2015 |
| INV002 | P001 | 600 | 09/02/2015 |
+---------------+---------------+------------------+--------------+
I can replicate this by making a cursor and looping through the table but looking for the best way to achieve this. Any nudge to the right direction would help a lot.

I think you can use a query like this:
;WITH t As (
SELECT *
, ROW_NUMBER() OVER (ORDER BY ReceivedDate, InvoiceNumber) As RowNo
FROM yourTable
), firstOverflow AS (
SELECT TOP(1)
t1.RowNo
FROM t t1
LEFT JOIN
t t2 ON t1.ProductNumber = t2.ProductNumber AND t1.ReceivedDate >= t2.ReceivedDate
GROUP BY t1.RowNo, t1.InvoiceNumber, t1.ProductNumber, t1.ReceivedQuantity, t1.ReceivedDate
HAVING SUM(t2.ReceivedQuantity) >= 1000
ORDER BY SUM(t2.ReceivedQuantity) - 1000)
SELECT *
FROM t
JOIN
firstOverflow ON t.RowNo <= firstOverflow.RowNo;
A better solution is this:
DECLARE #value int = 1000;
WITH t As (
SELECT *
, ROW_NUMBER() OVER (ORDER BY ReceivedDate, InvoiceNumber) As seq
FROM yourTable
), s As (
SELECT t.InvoiceNumber, t.ProductNumber, t.ReceivedQuantity, t.ReceivedDate, SUM(tt.ReceivedQuantity) As currentTotal
FROM t
LEFT JOIN
t tt ON t.ProductNumber = tt.ProductNumber AND t.seq >= tt.seq
GROUP BY t.InvoiceNumber, t.ProductNumber, t.ReceivedQuantity, t.ReceivedDate
), st As (
SELECT *
, ROW_NUMBER() OVER (ORDER BY (CASE WHEN s.currentTotal > #value THEN -currentTotal ELSE Null END) DESC) As seq
FROM s)
SELECT st.InvoiceNumber, st.ProductNumber, st.ReceivedQuantity, st.ReceivedDate
FROM st
WHERE currentTotal < #value
UNION ALL
SELECT st.InvoiceNumber, st.ProductNumber, st.ReceivedQuantity, st.ReceivedDate
FROM st
WHERE currentTotal >= #value AND st.seq = 1;

Try this query and give some feedback:
DECLARE #table TABLE (InvoiceNumber nvarchar(100),
ProductNumber nvarchar(100),
ReceivedQuantity int)
INSERT INTO #table VALUES ('inv001', 'p001', 500)
INSERT INTO #table VALUES ('inv002', 'p001', 600)
INSERT INTO #table VALUES ('inv003', 'p001', 600)
INSERT INTO #table VALUES ('inv004', 'p001', 600)
SQL 2012:
SELECT v.* FROM
(
SELECT t.*,
SUM(ReceivedQuantity) OVER (PARTITION BY ProductNumber ORDER BY InvoiceNumber) AS sum
FROM #table t
) v
WHERE sum <= 1000
SQL 2008:
SELECT v.* FROM
(
SELECT
a.InvoiceNumber
, a.ProductNumber
, SUM(b.ReceivedQuantity) AS sum
FROM
#table a
INNER JOIN #table b
ON a.InvoiceNumber >= b.InvoiceNumber AND a.ProductNumber = b.ProductNumber
GROUP BY
a.InvoiceNumber
, a.ProductNumber
) v
WHERE sum <= 1000

Selecting records with maximum value in group

I have a transaction table with the following structure:
select t.[GUID], t.[ID], ts.Description "Status", t.Payee, t.Amount, t.SequenceNumber
from [Transaction] t
inner join TransactionStatus ts on t.StatusID = ts.ID
GUID | ID | Status | Payee | Amount | SequenceNumber
AF732CF5-E6C0-E411-B8F6-004056AB77C2 | 1 | Posted | Amy | 500.00 | 1
AF732CF5-E6C0-E411-B8F6-004056AB77C2 | 2 | Voided | Amy | 500.00 | 2
1F7D880C-E7C0-E411-B8F6-004056AB77C2 | 3 | Posted | Bob | 70.00 | 1
AF732CF5-E6C0-E411-B8F6-004056AB77C2 | 4 | Posted | Amy | 512.50 | 3
1F7D880C-E7C0-E411-B8F6-004056AB77C2 | 5 | Posted | Bob | 66.00 | 2
F2CC0B03-76C7-E411-A48D-004056AB787C | 6 | Pending | Carol | 240.00 | NULL
I'm trying to construct a query to group the records by GUID and select the single record with the largest SequenceNumber (if it isn't NULL):
GUID | ID | Status | Payee | Amount | SequenceNumber
AF732CF5-E6C0-E411-B8F6-004056AB77C2 | 4 | Posted | Amy | 512.50 | 3
1F7D880C-E7C0-E411-B8F6-004056AB77C2 | 5 | Posted | Bob | 66.00 | 2
F2CC0B03-76C7-E411-A48D-004056AB787C | 6 | Pending | Carol | 240.00 | NULL
I've tried adding this line:
where SequenceNumber = (select MAX(SequenceNumber) from [Transaction] t2 where t.[GUID] = t2.[GUID])
but that doesn't get me any transactions where the status is Pending (they don't have sequence numbers). How can I fix this query?

If it's SQL-Server you can use a CTE + ROW_NUMBER:
WITH CTE AS
(
select t.[GUID], t.[ID], ts.Description "Status", t.Payee, t.Amount, t.SequenceNumber,
rn = row_number() over (partition by t.[GUID] Order By t.SequenceNumber DESC)
from [Transaction] t
inner join TransactionStatus ts on t.StatusID = ts.ID
)
SELECT GUID, ID, Status, Payee, Amount, SequenceNumber
FROM CTE
WHERE rn = 1
This will include the row where SequenceNumber is null. If you want all rows with the maximum SequenceNumber(in case of ties) use DENSE_RANK instead of ROW_NUMBER.

You can calculate the MAX(ID) and it's related [GUID] in a subquery and JOIN to it in order to get the desired results:
Sample subquery:
SELECT [GUID] ,
MAX(ID) MaxId
FROM Transaction
GROUP BY [GUID]
Would produce:
GUID MaxId
1F7D880C-E7C0-E411-B8F6-004056AB77C2 5
AF732CF5-E6C0-E411-B8F6-004056AB77C2 4
F2CC0B03-76C7-E411-A48D-004056AB787C 6
Full Demo:
CREATE TABLE #Transaction
(
[GUID] VARCHAR(36) ,
[ID] INT ,
[Status] VARCHAR(7) ,
[Payee] VARCHAR(5) ,
[Amount] INT ,
[SequenceNumber] VARCHAR(4)
);
INSERT INTO #Transaction
( [GUID], [ID], [Status], [Payee], [Amount], [SequenceNumber] )
VALUES ( 'AF732CF5-E6C0-E411-B8F6-004056AB77C2', 1, 'Posted', 'Amy', 500.00,
'1' ),
( 'AF732CF5-E6C0-E411-B8F6-004056AB77C2', 2, 'Voided', 'Amy', 500.00,
'2' ),
( '1F7D880C-E7C0-E411-B8F6-004056AB77C2', 3, 'Posted', 'Bob', 70.00,
'1' ),
( 'AF732CF5-E6C0-E411-B8F6-004056AB77C2', 4, 'Posted', 'Amy', 512.50,
'3' ),
( '1F7D880C-E7C0-E411-B8F6-004056AB77C2', 5, 'Posted', 'Bob', 66.00,
'2' ),
( 'F2CC0B03-76C7-E411-A48D-004056AB787C', 6, 'Pending', 'Carol',
240.00, NULL );
SELECT #Transaction.*
FROM #Transaction
INNER JOIN ( SELECT [GUID] ,
MAX(ID) MaxId
FROM #Transaction
GROUP BY [GUID]
) t ON t.[GUID] = #Transaction.[GUID]
AND t.MaxId = #Transaction.ID
ORDER BY ID

Try this way to get maximum SequenceNumber
CASE WHEN MAX(SequenceNumber IS NULL) = 0 THEN MAX(SequenceNumber) ELSE NULL END AS SequenceNumber

I don't know if SQL Server has windowing functions, so you may be able to do this more cleanly, but here's a vanilla SQL solution:
select highest.[GUID],
highest.[ID],
ts.Description "Status",
highest.Payee,
highest.Amount,
highest.SequenceNumber
from [Transaction] highest
join TransactionStatus ts
on ts.ID = highest.ID
left join [Transaction] higher
on higher.[GUID] = highest.[GUID]
and higher.SequenceNumber > highest.SequenceNumber
where higher.[GUID] is null;

omething like this:
SELECT * FROM
(
select
t.[GUID], t.[ID], ts.Description "Status", t.Payee, t.Amount,
ROW_NUMBER() OVER PARTITION BY (t.[GUID]
ORDER BY t.SequenceNumber DESC) AS rownum
from [Transaction] t
inner join TransactionStatus ts on t.StatusID = ts.ID
)vals where vals.rownum = 1

Selecting most recent record for each group with a sum function

This is a sample table
ID | Serial | Quantity | Date_Created
-------------------------------------
1 | AS1GD | 10 | 2014-12-25 8:00:00 AM
1 | GO9A4 | 5 | 2014-12-28 9:04:32 AM
2 | JF8WS | 15 | 2014-12-29 9:23:43 AM
2 | JFLE0 | 15 | 2015-01-04 10:53:12 AM
2 | S8A4A | 10 | 2015-01-05 9:12:46 AM
3 | FXOE3 | 20 | 2015-01-03 9:31:52 AM
3 | LSOR9 | 22 | 2015-01-06 12:00:44 PM
My expected result
ID | Serial | Total_Quantity | Last_DateCreated
-------------------------------------------------
1 | GO9A4 | 15 | 2014-12-28 9:04:32 AM
2 | S8A4A | 40 | 2015-01-05 9:12:46 AM
3 | LSOR9 | 42 | 2015-01-06 12:00:44 PM
Here's a query I tried but it's not returning the sum but only the quantity of the record
WITH total AS
( SELECT [ID], [date_created], [serial], sum(quantity) as qty,
ROW_NUMBER() OVER (PARTITION BY [ID] ORDER BY [date_created] DESC) AS rownum
FROM [table]
group by ID, date_created, serial
)
SELECT ID, Serial, qty, date_created
FROM total
WHERE rownum = 1

Since you are grouping by more than the ID but want the SUM() at the ID level, you can add OVER() to your SUM():
;WITH total AS ( SELECT [ID]
, [date_created]
, [serial]
, SUM(SUM(quantity)) OVER(PARTITION BY [ID]) as qty
, ROW_NUMBER() OVER (PARTITION BY [ID] ORDER BY [date_created] DESC) AS rownum
FROM [table]
GROUP BY ID, date_created, serial
)
SELECT ID, Serial, qty, date_created
FROM total
WHERE rownum = 1
The above creates an oddity in which you need two SUM() in order to use the OVER(), but you can ditch the GROUP BY altogether in your example:
;WITH total AS ( SELECT [ID]
, [date_created]
, [serial]
, SUM(quantity) OVER(PARTITION BY [ID]) as qty
, ROW_NUMBER() OVER (PARTITION BY [ID] ORDER BY [date_created] DESC) AS rownum
FROM Table1
)
SELECT ID, Serial, qty, date_created
FROM total
WHERE rownum = 1
Demo: SQL Fiddle

This will work as long as you don't have two records with the same ID created in the same second:
WITH RecentSUM AS
(
SELECT ID, MAX(DateCreated) DateCreated, SUM(Quantity) TotalQuantity
FROM [table]
GROUP BY ID
)
SELECT t.ID, t.Serial, r.TotalQuantity, r.DateCreated
FROM RecentSUM r
INNER JOIN [table] t ON t.ID = r.ID and t.DateCreated=r.DateCreated;

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

TSQL - GROUP BY on continous rows only - sql

Related

SQL return second max date for each id, date and channel

SQL - Choose which duplicate value to select

Select invoices based on quantity needed

Selecting records with maximum value in group

Selecting most recent record for each group with a sum function

Categories

Resources