Group data by the change of grouping column value in order - sql

With the following data
-- Sample price history: one row per product per day.
CREATE TABLE #ph (
    product INT,
    [date]  DATE,
    price   INT
);

-- Twelve consecutive days for product 1: price 1, then 2, then 1 again.
INSERT INTO #ph (product, [date], price)
VALUES
    (1, '20120101', 1),
    (1, '20120102', 1),
    (1, '20120103', 1),
    (1, '20120104', 1),
    (1, '20120105', 2),
    (1, '20120106', 2),
    (1, '20120107', 2),
    (1, '20120108', 2),
    (1, '20120109', 1),
    (1, '20120110', 1),
    (1, '20120111', 1),
    (1, '20120112', 1);
I would like to produce the following output:
product | date_from | date_to | price
1 | 20120101 | 20120105 | 1
1 | 20120105 | 20120109 | 2
1 | 20120109 | 20120112 | 1
If I group by price and show the min and max date, I get the following, which is not what I want (note the overlapping dates).
product | date_from | date_to | price
1 | 20120101 | 20120112 | 1
1 | 20120105 | 20120108 | 2
So essentially what I'm looking to do is group by the step change in data based on group columns product and price.
What is the cleanest way to achieve this?

There's a (more or less) known technique of solving this kind of problem, involving two ROW_NUMBER() calls, like this:
-- Gaps-and-islands: within a product, the overall row number minus the
-- row number inside the same price stays constant across a consecutive run
-- of one price, so (product, price, grp) identifies each run.
WITH marked AS (
    SELECT
        ph.*,
        ROW_NUMBER() OVER (PARTITION BY ph.product ORDER BY ph.[date])
          - ROW_NUMBER() OVER (PARTITION BY ph.product, ph.price ORDER BY ph.[date]) AS grp
    FROM #ph AS ph
)
-- Collapse every run to its first and last date.
SELECT
    m.product,
    MIN(m.[date]) AS date_from,
    MAX(m.[date]) AS date_to,
    m.price
FROM marked AS m
GROUP BY m.product, m.price, m.grp
ORDER BY m.product, MIN(m.[date])
Output:
product date_from date_to price
------- ---------- ------------- -----
1 2012-01-01 2012-01-04 1
1 2012-01-05 2012-01-08 2
1 2012-01-09 2012-01-12 1

I'm new to this forum so hope my contribution is helpful.
If you really don't want to use a CTE (although I think that's probably the best approach), you can get a solution using set-based code. You will need to test the performance of this code!
I have added an extra temp table so that I can use a unique identifier for each record, but I suspect you already have such a column in your source table. So here's the temp table.
-- Working copy of #ph carrying a per-product sequence number (SaleId) so that
-- neighbouring rows can be joined on SaleId +/- 1.
-- OBJECT_ID resolves the temp table for the current session only; a LIKE
-- search over tempdb.sys.tables can also match other sessions' tables.
IF OBJECT_ID('tempdb..#phwithId') IS NOT NULL
    DROP TABLE #phwithId;

CREATE TABLE #phwithId
(
    SaleId INT
  , ProductID INT
  , Price Money
  , SaleDate Date
);

-- Source is the temp table #ph created for the question (not a permanent table "ph").
INSERT INTO #phwithId (SaleId, ProductID, Price, SaleDate)
SELECT
    ROW_NUMBER() OVER (PARTITION BY product ORDER BY [date] ASC) AS SaleId
  , product
  , price
  , [date]
FROM #ph;
Now the main body of the Select statement
-- Pair the first date of every price run with the last date of the same run.
--   da1: rows that start a run (no previous row for the product, or the
--        previous row has a different price), numbered per product by date.
--   da2: rows that end a run (no next row for the product, or the next row
--        has a different price), numbered the same way.
-- The n-th start of a product is matched with the n-th end of that product.
SELECT
    da1.ProductId
  , da1.date_from
  , da2.date_to
  , da1.Price
FROM
(
    SELECT
        dfr.ProductId
      , ROW_NUMBER() OVER (PARTITION BY dfr.ProductId ORDER BY dfr.ChangeDate) AS rowno1
      , dfr.ChangeDate AS date_from
      , dfr.Price
    FROM
    (
        SELECT
            sl1.ProductId AS ProductId
          , sl1.SaleDate AS ChangeDate
          , sl1.Price
        FROM
            #phwithId sl1
        LEFT JOIN
            #phwithId sl2
            -- SaleId restarts at 1 for every product, so the product has to
            -- match as well; otherwise rows of different products are paired.
            ON  sl1.ProductId = sl2.ProductId
            AND sl1.SaleId = sl2.SaleId + 1
        WHERE
            sl1.Price <> sl2.Price OR sl2.Price IS NULL
    ) dfr
) da1
LEFT JOIN
(
    SELECT
        dto.ProductId
      , ROW_NUMBER() OVER (PARTITION BY dto.ProductId ORDER BY dto.ChangeDate) AS rowno2
      , dto.ChangeDate AS date_to
    FROM
    (
        SELECT
            sl1.ProductId
          , sl1.SaleDate AS ChangeDate
        FROM
            #phwithId sl1
        LEFT JOIN
            #phwithId sl3
            ON  sl1.ProductId = sl3.ProductId
            AND sl1.SaleId = sl3.SaleId - 1
        WHERE
            sl1.Price <> sl3.Price OR sl3.Price IS NULL
    ) dto
) da2
    -- Run numbers are per product, so pair them within the same product only.
    ON  da1.ProductId = da2.ProductId
    AND da1.rowno1 = da2.rowno2
ORDER BY
    da1.ProductId
  , da1.date_from
By joining the data source to itself offset by one record (+ or -), we can identify where the price buckets change, and then it's just a matter of getting the start and end dates for each bucket back into a single record.
All a bit fiddly, and I'm not sure it's going to give better performance, but I enjoyed the challenge.

-- Step 1: flag every row whose price differs from the previous row of the
-- same product (the first row of a product is always flagged, because LAG
-- returns NULL there and the comparison is not true).
WITH marked AS (
    SELECT
        product,
        [date],
        price,
        CASE
            WHEN LAG(price) OVER (PARTITION BY product ORDER BY [date]) = price
                THEN 0
            ELSE 1
        END AS is_price_change
    FROM #ph
),
-- Step 2: a running total of the flags numbers the consecutive price runs.
marked_as_group AS (
    SELECT
        m.*,
        SUM(m.is_price_change) OVER (
            PARTITION BY m.product
            ORDER BY m.[date]
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS price_change_group
    FROM marked m
)
-- Step 3: collapse each run to its first and last date.
SELECT
    product,
    MIN([date]) AS date_from,
    MAX([date]) AS date_to,
    MIN(price)  AS price
FROM marked_as_group
GROUP BY
    product,
    price_change_group
ORDER BY
    product,
    date_to

One solution I have come up with which is relatively "clean" is:
-- Number the rows of each product in date order.
;WITH cte_sort (product, [date], price, [row]) AS
(
    SELECT
        product,
        [date],
        price,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY [date] ASC) AS row
    FROM #ph
)
SELECT
    cur.product,
    cur.[date]      AS date_from,
    following.[date] AS date_to,
    cur.price
FROM cte_sort cur
-- Immediately following row of the same product, only if its price is unchanged.
LEFT OUTER JOIN cte_sort same_price_next
    ON  cur.product = same_price_next.product
    AND (cur.row + 1) = same_price_next.row
    AND cur.price = same_price_next.price
-- Date of the next row of the product, whatever its price (NULL on the last row).
OUTER APPLY (
    SELECT TOP 1 later.[date]
    FROM cte_sort later
    WHERE later.product = cur.product
      AND later.row > cur.row
    ORDER BY later.row
) following
-- Keep only rows that are not followed by a row with the same price.
WHERE same_price_next.row IS NULL
ORDER BY cur.[date]
I used a CTE with row_number because you then don't need to worry about whether any dates are missing if you use functions like dateadd. You obviously only need the outer apply if you want to have the date_to column (which I do).
This solution does solve my problem, I am however having a slight issue getting it to perform as quickly as I'd like on my table of 5 million rows.

-- Returns the column names of the given table as one space-separated string.
--   @table_name : name matched against information_schema.columns.table_name
--   returns     : nvarchar(4000); NULL when no column matches
-- Rows are visited in descending ordinal_position and each name is prepended,
-- so the names are intended to come out in ascending ordinal order.
-- NOTE(review): building a string with SELECT @var = ... + @var together with
-- ORDER BY relies on row-by-row assignment order, which SQL Server does not
-- guarantee -- confirm on the target version or switch to an aggregate.
CREATE FUNCTION [dbo].[AF_TableColumns](@table_name nvarchar(55))
RETURNS nvarchar(4000) AS
BEGIN
    DECLARE @str nvarchar(4000)

    SELECT @str = CAST(RTRIM(LTRIM(column_name)) AS nvarchar(500)) + COALESCE(' ' + @str, ' ')
    FROM information_schema.columns
    WHERE table_name = @table_name
    GROUP BY table_name, column_name, ordinal_position
    ORDER BY ordinal_position DESC

    RETURN @str
END
-- Usage: select dbo.AF_TableColumns('YourTable') Select * from YourTable

Related

First value in DATE minus 30 days SQL

I have a bunch of data from which I'm showing the ID, the max date, and its corresponding values (user id, type, ...). Then I need to take the MAX date for each ID, subtract 30 days, and show the first date and its corresponding values within this date period.
Example:
ID Date Name
1 01.05.2018 AAA
1 21.04.2018 CCC
1 05.04.2018 BBB
1 28.03.2018 AAA
expected:
ID max_date max_name previous_date previous_name
1 01.05.2018 AAA 05.04.2018 BBB
I have working solution using subselects, but as I have quite huge WHERE part, refresh takes ages.
SUBSELECT looks like that:
(SELECT MIN(N.name)
FROM t1 N
WHERE N.ID = T.ID
AND (N.date < MAX(T.date) AND N.date >= (MAX(T.date)-30))
AND (...)) AS PreviousName
How'd you write the select?
I'm using TSQL
Thanks
I can do this with 2 CTEs to build up the dates and names.
SQL Fiddle
MS SQL Server 2017 Schema Setup:
-- Fixture: dated names for two IDs.
CREATE TABLE t1 (
    ID      int,
    theDate date,
    theName varchar(10)
);

INSERT INTO t1 (ID, theDate, theName)
VALUES
    (1, '2018-05-01', 'AAA'),
    (1, '2018-04-21', 'CCC'),
    (1, '2018-04-05', 'BBB'),
    (1, '2018-03-27', 'AAA'),
    (2, '2018-05-02', 'AAA'),
    (2, '2018-05-21', 'CCC'),
    (2, '2018-03-03', 'BBB'),
    (2, '2018-01-20', 'AAA');
Main Query:
-- cte1: every row with its date minus 30 days; rn = 1 is the latest row per ID.
;WITH cte1 AS (
    SELECT
        t1.ID,
        t1.theDate,
        t1.theName,
        DATEADD(day, -30, t1.theDate) AS dMinus30,
        ROW_NUMBER() OVER (PARTITION BY t1.ID ORDER BY t1.theDate DESC) AS rn
    FROM t1
),
-- cte2: rows of the same ID dated on or after (latest date - 30 days);
-- rn = 1 is the earliest of them, theCount is the number of grouped rows per ID.
cte2 AS (
    SELECT
        recent.ID,
        recent.theDate,
        recent.theName,
        ROW_NUMBER() OVER (PARTITION BY recent.ID ORDER BY recent.theDate) AS rn,
        COUNT(*) OVER (PARTITION BY recent.ID) AS theCount
    FROM cte1 AS latest
    INNER JOIN cte1 AS recent
        ON  latest.ID = recent.ID
        AND recent.theDate >= latest.dMinus30
    WHERE latest.rn = 1
    GROUP BY recent.ID, recent.theDate, recent.theName
)
-- Latest row per ID next to the earliest row inside its 30-day window.
SELECT
    newest.ID,
    newest.theDate   AS max_date,
    newest.theName   AS max_name,
    earliest.theDate AS previous_date,
    earliest.theName AS previous_name,
    earliest.theCount
FROM cte1 AS newest
INNER JOIN cte2 AS earliest
    ON newest.ID = earliest.ID
WHERE newest.rn = 1
  AND earliest.rn = 1
Results:
| ID | max_date | max_name | previous_date | previous_name |
|----|------------|----------|---------------|---------------|
| 1 | 2018-05-01 | AAA | 2018-04-05 | BBB |
| 2 | 2018-05-21 | CCC | 2018-05-02 | AAA |
cte1 builds the list of max_date and max_name grouped by the ID and then using a ROW_NUMBER() window function to sort the groups by the dates to get the most recent date. cte2 joins back to this list to get all dates within the last 30 days of cte1's max date. Then it does essentially the same thing to get the last date. Then the outer query joins those two results together to get the columns needed while only selecting the most and least recent rows from each respectively.
I'm not sure how well it will scale with your data, but using the CTEs should optimize pretty well.
EDIT: For the additional requirement, I just added in another COUNT() window function to cte2.
I would do:
-- Latest and second-latest row per id, pivoted onto one line.
-- seqnum = 1 is the most recent row, seqnum = 2 the one before it.
-- Note: no 30-day window is applied here; seqnum = 2 is simply the previous row.
select e.id,
       max(case when e.seqnum = 1 then e.date end) as max_date,
       max(case when e.seqnum = 1 then e.name end) as max_name,
       max(case when e.seqnum = 2 then e.date end) as prev_date,
       max(case when e.seqnum = 2 then e.name end) as prev_name
from (select ex.*,
             row_number() over (partition by ex.id order by ex.date desc) as seqnum
      from example ex
     ) e
group by e.id;

Find nearest SUM amount from table SQL SERVER

Query
-- Fixtures. "DECLARE #name TABLE" is not valid T-SQL; temp tables are used so
-- that the #table1 / #table2 references in the queries resolve.
CREATE TABLE #table1 (accountno varchar(max), saved_amount decimal);
INSERT INTO #table1 (accountno, saved_amount) VALUES
('001',25),
('002',5);

CREATE TABLE #table2 (accountno varchar(max), payamount decimal, ilno int);
INSERT INTO #table2 (accountno, payamount, ilno) VALUES
('001',10,1),
('001',10,2),
('001',10,3),
('001',10,4),
('002',10,1),
('002',10,2);

-- The question's query, logic unchanged.
-- aa: every instalment with the running total of payamount per account.
WITH aa
AS (
    SELECT a.*
        ,b.ilno
        ,b.payamount
        ,SUM(payamount) OVER (
            PARTITION BY a.accountno ORDER BY CAST(a.accountno AS INT)
                ,ilno
            ) AS total_amount
    FROM #table1 a
    LEFT JOIN #table2 b ON a.accountno = b.accountno
    )
-- bb: highest instalment whose running total is still covered by saved_amount.
-- Accounts where even the first running total exceeds saved_amount get no row
-- here, which is why account 002 is missing from the result.
    ,bb
AS (
    SELECT accountno
        ,MAX(ilno) AS ilno
    FROM aa
    WHERE saved_amount >= total_amount
    GROUP BY accountno
    )
SELECT a.* FROM aa a INNER JOIN bb b on a.accountno =b.accountno AND a.ilno = b.ilno
Result
accountno | saved_amount | ilno | payamount | total_amount
----------------------------------------------------------
001 | 25 | 2 | 10 | 20
Expected Result
accountno | saved_amount | ilno | payamount | total_amount
----------------------------------------------------------
001 | 25 | 2 | 10 | 20
002 | 5 | 1 | 10 | 10
What I want is
If saved_amount is less than the first ilno, then get the first ilno else
get the highest ilno where saved_amount>=total_amount
You have a running total that you compare with the saved amount. You want the highest running total that doesn't exceed the saved amount. But in case even the initial pay amount exceeds the saved amount already, you want to default to this record. So the main task is to find a way of ranking the records. In my query I do it like this:
Prefer records where the running total does not exceed the saved amount.
Then look at the absolute value of their difference and take the smallest.
There are certainly other ways that achieve the same. Maybe even methods that you find more readable. Then just adjust the order by clause in the ranking query.
-- summed: every instalment with its running total, next to the account's saved amount.
;with summed as
(
  select
    t1.accountno,
    t1.saved_amount,
    t2.ilno,
    t2.payamount,
    t2.total_amount
  from #table1 t1
  join
  (
    select
      accountno,
      ilno,
      payamount,
      sum(payamount) over (partition by accountno order by ilno) as total_amount
    from #table2
  ) t2 on t2.accountno = t1.accountno
)
-- ranked: per account, prefer rows whose running total does not exceed the
-- saved amount, then the row whose running total is closest to it.
, ranked as
(
  select summed.*,
    row_number() over (partition by accountno
                       order by case when saved_amount >= total_amount then 1 else 2 end,
                                abs(saved_amount - total_amount)
                      ) as rn
  from summed
)
select *
from ranked
where rn = 1;
This is not the "nearest sum", as you said in the title, but the one that obeys the specified rules. So with a saved amount of 100 and paid amounts of first 1 and then 100, you'd get the record with a total of 1 (which is 99 less than the saved amount) and not the one with a total of 101 (which is only 1 more than the saved amount).
Other way to solve using flags:
first calculated one flag to point if saved_amount >= payamount for current row
calculated three more flags:
group_flag to show is there a case where saved_amount >= payamount for the given accountno
[min_ilno] and [max_ilno] for given account
Having these flags, the final result set is calculated easily. Here is the code:
-- DataSource: instalments with their running total and a flag that is 1 while
-- the saved amount still covers the running total.
WITH DataSource AS
(
    SELECT
        a.*,
        b.ilno,
        b.payamount,
        SUM(payamount) OVER (PARTITION BY a.accountno ORDER BY ilno) AS total_amount,
        CASE
            WHEN a.saved_amount >= SUM(payamount) OVER (PARTITION BY a.accountno ORDER BY ilno) THEN 1
            ELSE 0
        END AS [flag]
    FROM #table1 a
    LEFT JOIN #table2 b
        ON a.accountno = b.accountno
),
-- DataSourceFinal: per account, whether any row is covered (group_flag), the
-- first uncovered instalment (min_ilno) and the last covered one (max_ilno).
DataSourceFinal AS
(
    SELECT
        *,
        MAX(flag) OVER (PARTITION BY accountno) AS [group_flag],
        MIN(CASE WHEN flag = 0 THEN ilno ELSE NULL END) OVER (PARTITION BY accountno) AS [min_ilno],
        MAX(CASE WHEN flag = 1 THEN ilno ELSE NULL END) OVER (PARTITION BY accountno) AS [max_ilno]
    FROM DataSource
)
SELECT
    accountno,
    saved_amount,
    ilno,
    payamount,
    total_amount
FROM DataSourceFinal
WHERE ([group_flag] = 1 AND [ilno] = [max_ilno])
   OR ([group_flag] = 0 AND [ilno] = [min_ilno]);
and the output:

get the most two recent dates for each customer

Basically, I need to retrieve the last two dates for customers who purchased on at least two different dates (some customers have purchased on only one date). The data has the following form:
client_id date
1 2016-07-02
1 2016-07-02
1 2016-06-01
2 2015-06-01
and I would like to get it in the following form
client_id previous_date last_date
1 2016-06-01 2016-07-02
Remarks:
a client can have multiple entries for the same date
a client can have entries only for one date, such customer should be discarded
Rank your dates with DENSE_RANK. Then group by client_id and show the last dates (ranked #1 and #2).
-- Rank each client's distinct dates from newest (1) to oldest; duplicates of a
-- date share a rank. Clients with a single distinct date never reach rank 2
-- and are dropped by the HAVING clause.
select
  ranked.client_id,
  max(case when ranked.rn = 2 then ranked.date end) as previous_date,
  max(case when ranked.rn = 1 then ranked.date end) as last_date
from
(
  select
    client_id,
    date,
    dense_rank() over (partition by client_id order by date desc) as rn
  from mytable
) ranked  -- most engines require an alias on a derived table
group by ranked.client_id
having max(ranked.rn) > 1;
build up:
-- psql session: create table s153 (c int, d date) and insert four rows.
t=# create table s153 (c int, d date);
CREATE TABLE
t=# insert into s153 values (1,'2016-07-02'), (1,'2016-07-02'),(1,'2016-06-01'),(2,'2016-06-01');
INSERT 0 4
query:
-- a: distinct (c, d) pairs.
-- b: newest and second-newest d per c. Dates are ordered descending and the
--    frame spans the whole partition, so every row of a client carries the
--    same pair and DISTINCT folds them into one row.
t=# with a as (
select distinct c,d from s153
)
, b as (
select distinct c
, nth_value(d,1) over w as last_date
, nth_value(d,2) over w as prev_date
from a
window w as (partition by c order by d desc rows between unbounded preceding and unbounded following)
)
select * from b where prev_date is not null
;
c | last_date | prev_date
---+------------+------------
1 | 2016-07-02 | 2016-06-01
(1 row)
UNTESTED:
We use a common table expression to assign a row number based on the date in descending order and then only include those records having a row number <=2 and then ensure that those having 1 row are excluded by the having.
-- Rank each client's dates newest-first. DENSE_RANK gives repeated dates the
-- same rank, so DISTINCT can collapse them (ROW_NUMBER would make every row
-- unique and let a duplicated latest date occupy both rank 1 and rank 2).
-- "mytable" stands in for the real table name (TABLE itself is a reserved word).
WITH CTE AS (
    SELECT DISTINCT
        Client_ID
      , Date
      , DENSE_RANK() OVER (PARTITION BY Client_ID ORDER BY Date DESC) AS rn
    FROM mytable
)
-- Of the two newest dates, the smaller is the previous one; clients with only
-- one distinct date never reach rn = 2 and are excluded.
SELECT Client_ID, MIN(Date) AS previous_date, MAX(Date) AS last_date
FROM CTE
WHERE rn <= 2
GROUP BY Client_ID
HAVING MAX(rn) > 1
All you need is a group by...
--test date
--test data
DECLARE @tablename TABLE
(
    client_id int,
    [date] datetime
);
INSERT INTO @tablename (client_id, [date])
VALUES (1, '2016-07-02'),
       (1, '2016-07-02'),
       (1, '2016-06-01'),
       (2, '2015-06-01');
--query
-- MIN is the client's earliest date, so it equals the previous date only when
-- the client has exactly two distinct dates.
-- HAVING discards clients that purchased on a single date.
SELECT client_id, MIN([date]) AS [PREVIOUS_DATE], MAX([date]) AS [LAST_DATE]
FROM @tablename
GROUP BY client_id
HAVING MIN([date]) <> MAX([date]);
Updated
-- create data
-- Fixture: purchase dates per client.
CREATE TABLE myTable (
    client_id  integer,
    given_date date
);

INSERT INTO myTable
VALUES
    (1, '2016-07-02'),
    (1, '2016-07-02'),
    (1, '2016-06-01'),
    (1, '2016-06-03'),
    (1, '2016-06-09'),
    (2, '2015-06-01'),
    (3, '2016-06-03'),
    (3, '2016-06-09');
-- query
-- Pair every date of a client with every different date of the same client,
-- then keep the pair with the newest LAST_DATE and the smallest distance to it.
-- NOTE(review): subtracting one date from another is dialect-dependent -- confirm
-- it is supported on the target engine.
SELECT
    ranked.client_id,
    ranked.PREVIOUS_DATE,
    ranked.LAST_DATE
FROM (
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY a.client_id
            ORDER BY b.given_date DESC, (MAX(b.given_date) - a.given_date)
        ) AS rn,
        a.client_id,
        a.given_date AS PREVIOUS_DATE,
        MAX(b.given_date) - a.given_date AS diff,
        (b.given_date) AS LAST_DATE
    FROM myTable AS a
    JOIN myTable AS b
        ON b.client_id = a.client_id
    WHERE a.given_date <> b.given_date
    GROUP BY a.client_id, a.given_date, b.given_date
) AS ranked
WHERE ranked.rn = 1

How to find max value from each group and display their information when using "group by"

For example, I created a table of people contributing to 2 campaigns:
+-------------------------------------+
| ID Name Campaign Amount (USD) |
+-------------------------------------+
| 1 A 1 10 |
| 2 B 1 5 |
| 3 C 2 7 |
| 4 D 2 9 |
+-------------------------------------+
Task: For each campaign, find the person (Name, ID) who contributed the most to it.
Expected result is
+-----------------------------------------+
| Campaign Name ID |
+-----------------------------------------+
| 1 A 1 |
| 2 D 4 |
+-----------------------------------------+
I used "group by Campaign", but the result has only the 2 columns "Campaign" and "max value", whereas I need "Name" and "ID".
Thanks for your help.
Edited: I fix some values, really sorry
You can use analytic functions for this:
-- Attach each campaign's highest amount to its rows, then keep the rows that
-- reach it (ties are all returned).
with with_max as (
    select t.*,
           max(amount) over (partition by campaign) as max_amount
    from t
)
select
    name,
    id,
    amount
from with_max
where amount = max_amount;
You can also do it by assigning a rank/row_number partitioned by campaign and ordered by amount descending.
Query
-- Rank contributors inside each campaign by amount, highest first;
-- rank 1 is the top contributor (ties share the rank).
;WITH cte AS (
    SELECT
        DENSE_RANK() OVER (PARTITION BY [Campaign] ORDER BY [Amount] DESC) AS [num],
        *
    FROM [your_table_name]
)
SELECT
    [Campaign],
    [Name],
    [ID]
FROM cte
WHERE [num] = 1;
Try the next query:-
-- Top contributor(s) per campaign: find each campaign's highest amount, then
-- join back to pick the rows that reach it. (Grouping by Campaign, Name and ID
-- together would return every row, and a derived-table column needs a name.)
SELECT t.Campaign , t.Name , t.ID
FROM MyTable AS t
INNER JOIN (
    SELECT Campaign , MAX (Amount) AS MaxAmount
    FROM MyTable
    GROUP BY Campaign
) temp
    ON  temp.Campaign = t.Campaign
    AND temp.MaxAmount = t.Amount;
Simply use Where Clause with the max of amount group by Campaign:-
As following generic code:-
-- Rows whose d equals the maximum d of their own group a.
-- The subquery is correlated on a: comparing against the maxima of all groups
-- would also return a row whose d merely equals another group's maximum.
select t.a, t.b , t.c
from tablename t
where t.d =
(
select max(g.d)
from tablename g
where g.a = t.a
)
Demo:-
-- Demo fixture
Create table #MyTable (ID int , Name char(1), Campaign int , Amount int)
go
insert into #MyTable values (1,'A',1,10)
insert into #MyTable values (2,'B',1,5)
insert into #MyTable values (3,'C',2,7)
insert into #MyTable values (4,'D',2,9)
go
-- Top contributor(s) per campaign. The subquery is correlated on Campaign so a
-- row is compared only with the maximum of its own campaign, not with the
-- maxima of every campaign.
select t.Campaign, t.Name , t.ID
from #MyTable t
where t.Amount =
(
select max(m.Amount)
from #MyTable m
where m.Campaign = t.Campaign
)
drop table #MyTable
Result:-
Please find the below code for the same
-- For every row count the rows of the same campaign with a larger amount;
-- a count of zero means the row holds the campaign's highest amount.
SELECT *
FROM #MyTable T
OUTER APPLY (
    SELECT COUNT(*) AS record
    FROM #MyTable T1
    WHERE T1.Campaign = T.Campaign
      AND T1.amount > T.amount
) E
WHERE E.record = 0

TSQL getting max and min date with a seperate but not unique record

example table:
test_date | test_result | unique_ID
12/25/15 | 100 | 50
12/01/15 | 150 | 75
10/01/15 | 135 | 75
09/22/14 | 99 | 50
04/10/13 | 125 | 50
I need to find the first and last test date as well as the test result to match said date by user. So, I can group by ID, but not test result.
-- Question's sketch: latest and earliest test_date per unique_id.
-- The bracketed text marks where the matching test_result is wanted; as
-- written, T-SQL reads it as a (duplicated) column alias.
SELECT MAX(test_date)[need matching test_result],
MIN(test_date) [need matching test_result],
unique_id
from [table]
group by unique_id
THANKS!
-- Fixture for the example. Dates are written as yyyymmdd so they are read the
-- same way under any session DATEFORMAT / language setting
-- (the original mm/dd/yy literals are only valid under mdy).
CREATE TABLE #t
(
    test_date date ,
    Test_results int,
    Unique_id int
)
INSERT INTO #t (test_date, Test_results, Unique_id)
VALUES ( '20151225',100,50 ),
       ( '20151201',150,75 ),
       ( '20151001',135,75 ),
       ( '20140922',99,50 ),
       ( '20130410',125,50 )
-- bounds: first and last test date per Unique_id.
;WITH bounds AS (
    SELECT unique_id,
           MIN(test_date) AS test_datemin,
           MAX(test_date) AS test_datemax
    FROM #t
    GROUP BY unique_id
)
-- Row holding each id's first test. The join matches on the id as well as the
-- date; matching on the date alone would also pick up another id's row that
-- happens to fall on the same day.
SELECT 'MinTestDate' AS Type, a.test_date, a.Test_results, a.Unique_id
FROM #t a
INNER JOIN bounds b
    ON  a.Unique_id = b.unique_id
    AND a.test_date = b.test_datemin
UNION ALL
-- Row holding each id's last test.
SELECT 'MaxTestDate' AS Type, a.test_date, a.Test_results, a.Unique_id
FROM #t a
INNER JOIN bounds b
    ON  a.Unique_id = b.unique_id
    AND a.test_date = b.test_datemax
I would recommend window functions. The following returns the information on 2 rows per id:
-- First and last test per unique_id: seqnum_asc = 1 is the earliest row,
-- seqnum_desc = 1 the latest (an id with a single test yields one row).
-- [table] is the question's placeholder name; TABLE is a reserved word and
-- must be delimited.
select t.*
from (select t.*,
             row_number() over (partition by unique_id order by test_date) as seqnum_asc,
             row_number() over (partition by unique_id order by test_date desc) as seqnum_desc
      from [table] t
     ) t
where t.seqnum_asc = 1
   or t.seqnum_desc = 1;
For one row, use conditional aggregation (or pivot if you prefer):
-- One row per unique_id: first and last test date with the result recorded on
-- each. seqnum_asc = 1 marks the earliest row, seqnum_desc = 1 the latest.
-- [table] is the question's placeholder name; TABLE is a reserved word and
-- must be delimited.
select t.unique_id,
       min(t.test_date) as first_test_date,
       max(case when t.seqnum_asc = 1 then t.test_result end) as first_test_result,
       max(t.test_date) as last_test_date,
       max(case when t.seqnum_desc = 1 then t.test_result end) as last_test_result
from (select t.*,
             row_number() over (partition by unique_id order by test_date) as seqnum_asc,
             row_number() over (partition by unique_id order by test_date desc) as seqnum_desc
      from [table] t
     ) t
group by t.unique_id;
Consider using a combination of self-joins and derived tables:
-- minTable / maxTable: earliest and latest test date per unique_ID.
WITH minTable AS (
    SELECT Min(TestTable.test_date) AS MinOftest_date,
           TestTable.unique_ID
    FROM TestTable
    GROUP BY TestTable.unique_ID
),
maxTable AS (
    SELECT Max(TestTable.test_date) AS MaxOftest_date,
           TestTable.unique_ID
    FROM TestTable
    GROUP BY TestTable.unique_ID
)
-- t1 is the row of the earliest test, t2 the row of the latest test of the same id.
SELECT
    t1.unique_id,
    minTable.MinOftest_date,
    t1.test_result AS Mintestdate_result,
    maxTable.MaxOftest_date,
    t2.test_result AS Maxtestdate_result
FROM TestTable AS t1
INNER JOIN minTable
    ON  t1.test_date = minTable.MinOftest_date
    AND t1.unique_id = minTable.unique_id
INNER JOIN maxTable
    ON  minTable.unique_id = maxTable.unique_id
INNER JOIN TestTable AS t2
    ON  t2.test_date = maxTable.MaxOftest_date
    AND t2.unique_ID = maxTable.unique_ID;
OUTPUT
unique_id MinOftest_date Mintestdate_result MaxOftest_date Maxtestdate_result
50 4/10/2013 125 12/25/2015 100
75 10/1/2015 135 12/1/2015 150