Finding Duplicate Orders (by time proximity) - sql

I have a table of orders that I know contains duplicates:
customer   order_number  order_date
---------  ------------  -------------------
1          1             2012-03-01 01:58:00
1          2             2012-03-01 02:01:00
1          3             2012-03-01 02:03:00
2          4             2012-03-01 02:15:00
3          5             2012-03-01 02:18:00
3          6             2012-03-01 04:30:00
4          7             2012-03-01 04:35:00
5          8             2012-03-01 04:38:00
6          9             2012-03-01 04:58:00
6          10            2012-03-01 04:59:00
I want to find all duplicates (orders by the same customer within 60 minutes of each other), either as a result set consisting of the 'duplicate' rows or as a set of all customers with a count of how many duplicates.
Here is what I have tried
SELECT
    customer,
    count(*)
FROM
    orders
GROUP BY
    customer,
    DATEPART(HOUR, order_date)
HAVING (count(*) > 1)
This doesn't work when duplicates are within 60 minutes of each other but fall in different hours, e.g. 1:58 and 2:02.
I've also tried this
SELECT
    o1.customer,
    o1.order_number,
    o2.order_number,
    DATEDIFF(MINUTE, o1.order_date, o2.order_date) AS [diff]
FROM
    orders o1
    LEFT OUTER JOIN orders o2
        ON o1.customer = o2.customer AND o1.order_number <> o2.order_number
WHERE
    ABS(DATEDIFF(MINUTE, o1.order_date, o2.order_date)) < 60
Now this gives me all of the duplicates, but it also gives me multiple rows per duplicate order, i.e. (o1, o2) and (o2, o1), which wouldn't be so bad if there weren't some orders with multiple duplicates. In those cases I get (o1, o2), (o1, o3), (o2, o1), (o2, o3), (o3, o1), (o3, o2) etc. I get all of the permutations.
Anyone have some insight? I'm not necessarily looking for the best performing answer here, just one that works.

SELECT
    *,
    CASE WHEN EXISTS (SELECT *
                      FROM orders AS lookup
                      WHERE lookup.customer = orders.customer
                        AND lookup.order_date < orders.order_date
                        AND lookup.order_date >= DATEADD(hour, -1, orders.order_date)
                     )
         -- an earlier order from the same customer within the previous hour makes this row a duplicate
         THEN 'Duplicate Order'
         ELSE 'Principal Order'
    END AS Order_Status
FROM
    orders
Using EXISTS and a correlated sub-query, you can check whether the same customer placed any preceding order within the last hour; if so, the row is flagged as a duplicate of that earlier (principal) order.
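If you only want the duplicate rows themselves, the same correlated EXISTS test can move into the WHERE clause; a minimal sketch against the same orders table:
SELECT o.*
FROM orders AS o
WHERE EXISTS (SELECT 1
              FROM orders AS lookup
              WHERE lookup.customer = o.customer
                AND lookup.order_date < o.order_date
                AND lookup.order_date >= DATEADD(hour, -1, o.order_date)) -- an earlier order within the previous hour exists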

Maybe something like this:
Test data:
CREATE TABLE #tbl (customer INT, order_number INT, order_date DATETIME)
INSERT INTO #tbl
VALUES
(1,1,'2012-03-01 01:58:00'),
(1,2,'2012-03-01 02:01:00'),
(1,3,'2012-03-01 02:03:00'),
(2,4,'2012-03-01 02:15:00'),
(3,5,'2012-03-01 02:18:00'),
(3,6,'2012-03-01 04:30:00'),
(4,7,'2012-03-01 04:35:00'),
(5,8,'2012-03-01 04:38:00'),
(6,9,'2012-03-01 04:58:00'),
(6,10,'2012-03-01 04:59:00')
Query
;WITH CTE
AS
(
    SELECT
        MIN(datediff(minute, '1990-1-1', order_date)) OVER (PARTITION BY customer) AS minDate,
        datediff(minute, '1990-1-1', order_date) AS DateTicks,
        tbl.customer
    FROM
        #tbl AS tbl
)
SELECT
    CTE.customer,
    SUM(CASE WHEN (CTE.DateTicks - CTE.minDate) < 60 THEN 1 ELSE 0 END) AS ordersNearFirst
FROM
    CTE
GROUP BY
    CTE.customer
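For the sample data this should return:
customer  ordersNearFirst
--------  ---------------
1         3
2         1
3         1
4         1
5         1
6         2
Note that the first order counts itself, so every customer gets at least 1, and each order is compared only against the customer's first order; duplicates occurring more than an hour after the first order would not be counted.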

The following query identifies all permutations of orders that fall within 60 minutes of one another:
CREATE TABLE #orders (CustomerId INT, OrderId INT, OrderDate DATETIME)
INSERT INTO #orders
VALUES
(1, 1, '2012-03-01 01:58:00'),
(1, 2, '2012-03-01 02:01:00'),
(1, 3, '2012-03-01 02:03:00'),
(2, 4, '2012-03-01 02:15:00'),
(3, 5, '2012-03-01 02:18:00'),
(3, 6, '2012-03-01 04:30:00'),
(4, 7, '2012-03-01 04:35:00'),
(5, 8, '2012-03-01 04:38:00'),
(6, 9, '2012-03-01 04:58:00'),
(6, 10, '2012-03-01 04:59:00');
with ProximityOrderCascade(CustomerId, OrderId, ProximateOrderId, MinutesDifference, OrderDate, ProximateOrderDate)
as
(
    select o.customerid, o.orderid, null, null, o.orderdate, o.orderdate
    from #orders o

    union all

    select o.customerid, o.orderid, p.orderid, datediff(minute, p.OrderDate, o.OrderDate), o.OrderDate, p.OrderDate
    from ProximityOrderCascade p
    inner join #orders o
        on p.customerid = o.customerid
        and abs(datediff(minute, p.OrderDate, o.OrderDate)) between 0 and 60
        and o.orderid <> p.orderid
    where p.ProximateOrderId is null
)
select *
from ProximityOrderCascade
where ProximateOrderId is not null
From there, you can transform the results into a query of your choice; one such transformation is sketched after the result set below. For this sample data, the results identify only customers 1 and 6 as having 'duplicate' orders.
CustomerId OrderId ProximateOrderId MinutesDifference OrderDate ProximateOrderDate
----------- ----------- ---------------- ----------------- ----------------------- -----------------------
6 9 10 -1 2012-03-01 04:58:00.000 2012-03-01 04:59:00.000
6 10 9 1 2012-03-01 04:59:00.000 2012-03-01 04:58:00.000
1 1 3 -5 2012-03-01 01:58:00.000 2012-03-01 02:03:00.000
1 2 3 -2 2012-03-01 02:01:00.000 2012-03-01 02:03:00.000
1 1 2 -3 2012-03-01 01:58:00.000 2012-03-01 02:01:00.000
1 3 2 2 2012-03-01 02:03:00.000 2012-03-01 02:01:00.000
1 2 1 3 2012-03-01 02:01:00.000 2012-03-01 01:58:00.000
1 3 1 5 2012-03-01 02:03:00.000 2012-03-01 01:58:00.000
(8 row(s) affected)
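As one example of the transformation mentioned above, replacing the final SELECT of the recursive CTE with the sketch below collapses the permutations into one row per customer, counting the orders that participate in at least one 60-minute pair (for this sample data: 3 for customer 1, 2 for customer 6):
-- replaces: select * from ProximityOrderCascade where ProximateOrderId is not null
select CustomerId, count(distinct OrderId) as orders_with_duplicates
from ProximityOrderCascade
where ProximateOrderId is not null
group by CustomerId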

Related

SQL server group / partition to condense history table

Got a table of dates someone was in a particular category like this:
drop table if exists #category
create table #category (personid int, categoryid int, startdate datetime, enddate datetime)
set dateformat dmy -- the literals below are dd/mm/yyyy, so this is needed for them to convert correctly
insert into #category
select * from
(
select 1 Personid, 1 CategoryID, '01/04/2010' StartDate, '31/07/2016' EndDate union
select 1 Personid, 5 CategoryID, '07/08/2016' StartDate, '31/03/2019' EndDate union
select 1 Personid, 5 CategoryID, '01/04/2019' StartDate, '01/04/2019' EndDate union
select 1 Personid, 5 CategoryID, '02/04/2019' StartDate, '11/08/2019' EndDate union
select 1 Personid, 4 CategoryID, '12/08/2019' StartDate, '03/11/2019' EndDate union
select 1 Personid, 5 CategoryID, '04/11/2019' StartDate, '22/03/2020' EndDate union
select 1 Personid, 5 CategoryID, '23/03/2020' StartDate, NULL EndDate union
select 2 Personid, 1 CategoryID, '01/04/2010' StartDate, '09/04/2015' EndDate union
select 2 Personid, 4 CategoryID, '10/04/2015' StartDate, '31/03/2018' EndDate union
select 2 Personid, 4 CategoryID, '01/04/2018' StartDate, '31/03/2019' EndDate union
select 2 Personid, 4 CategoryID, '01/04/2019' StartDate, '23/06/2019' EndDate union
select 2 Personid, 4 CategoryID, '24/06/2019' StartDate, NULL EndDate
) x
order by personid, startdate
I'm trying to condense it so I get this:
PersonID  categoryid  startdate   EndDate
--------  ----------  ----------  ----------
1         1           01/04/2010  31/07/2016
1         5           07/08/2016  11/08/2019
1         4           12/08/2019  03/11/2019
1         5           04/11/2019  NULL
2         1           01/04/2010  09/04/2015
2         4           10/04/2015  NULL
I'm having issues with people like personid 1 where they are in (e.g.) category 5, then go into category 4, and then back into category 5.
So doing something like:
select
    personid,
    categoryid,
    min(startdate) startdate,
    max(enddate) enddate
from #category
group by
    personid, categoryid
gives me the earliest date from category 5's first period and the latest date from its second period, which creates an overlapping period.
So I tried partitioning it with a rownum or rank, but it still does the same thing, i.e. it treats all the 'category 5' rows as the same group:
select
    rank() over (partition by personid, categoryid order by personid, startdate) rank,
    c.*
from #category c
order by personid, startdate
rank  personid  categoryid  startdate                enddate
----  --------  ----------  -----------------------  -----------------------
1     1         1           2010-04-01 00:00:00.000  2016-07-31 00:00:00.000
1     1         5           2016-08-07 00:00:00.000  2019-03-31 00:00:00.000
2     1         5           2019-04-01 00:00:00.000  2019-04-01 00:00:00.000
3     1         5           2019-04-02 00:00:00.000  2019-08-11 00:00:00.000
1     1         4           2019-08-12 00:00:00.000  2019-11-03 00:00:00.000
4     1         5           2019-11-04 00:00:00.000  2020-03-22 00:00:00.000
5     1         5           2020-03-23 00:00:00.000  NULL
1     2         1           2010-04-01 00:00:00.000  2015-04-09 00:00:00.000
1     2         4           2015-04-10 00:00:00.000  2018-03-31 00:00:00.000
2     2         4           2018-04-01 00:00:00.000  2019-03-31 00:00:00.000
3     2         4           2019-04-01 00:00:00.000  2019-06-23 00:00:00.000
4     2         4           2019-06-24 00:00:00.000  NULL
You can see in the rank column that the category 5 rows start off 1, 2, 3, miss a row, and carry on 4, 5, so they are clearly in the same partition. I thought that adding the order by clause would force it to start a new partition when the category changed from 5 to 4 and back again.
Any thoughts?
This is a type of gaps-and-islands problem. However, if your data tiles perfectly (no gaps), as it does in your example data, then you can do this without any aggregation at all, which should be the most efficient method:
select personid, categoryid, startdate,
dateadd(day, -1, lead(startdate) over (partition by personid order by startdate)) as enddate
from (select c.*,
lag(categoryid) over (partition by personid order by startdate) as prev_categoryid
from #category c
) c
where prev_categoryid is null or prev_categoryid <> categoryid;
The where clause selects only the rows where the category changes. The lead() then gets the next start date and subtracts 1 day to produce your desired enddate.
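To see why the where clause picks the right rows, here is what the inner subquery yields for personid 1 with the sample data (kept = survives the where clause):
categoryid  startdate   prev_categoryid  kept
----------  ----------  ---------------  ----------------------
1           01/04/2010  NULL             yes (first row)
5           07/08/2016  1                yes (category changed)
5           01/04/2019  5                no
5           02/04/2019  5                no
4           12/08/2019  5                yes (category changed)
5           04/11/2019  4                yes (category changed)
5           23/03/2020  5                no
The four kept rows correspond exactly to the four islands in the desired output.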

Aggregate log table records to avoid redundancy

I have a table for product change tracking that looks like this:
CREATE TABLE ProductHistory (
    ProductId INT NOT NULL,
    Name NVARCHAR(50) NOT NULL,
    Price MONEY NOT NULL,
    StartDate DATETIME NOT NULL,
    EndDate DATETIME NOT NULL
)
INSERT INTO ProductHistory VALUES
(1, 'Phone', 100, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(1, 'Phone', 100, '2020-11-20 01:01', '2020-11-20 02:00'), /* no change */
(1, 'Phone', 200, '2020-11-20 02:01', '2020-11-20 03:00'), /* changed, current */
(2, 'Apple', 5, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(2, 'Apple', 10, '2020-11-20 01:01', '2020-11-20 02:00'), /* changed */
(2, 'Pineapple', 10, '2020-11-20 02:01', '2020-11-20 03:00'), /* changed, current */
(3, 'Orange juice', 100, '2020-11-21 00:00', '2020-11-21 01:00'), /* initial */
(3, 'Orange juice', 100, '2020-11-21 01:01', '2020-11-21 02:00'), /* no change */
(3, 'Orange juice', 100, '2020-11-21 02:01', '2020-11-21 03:00') /* no change, current */
I was hoping to come up with a query to get the results below. Notice that the records without actual changes are supposed to be merged together so that there is no redundancy.
ProductId Name Price StartDate EndDate
----------- -------------- ------- -------------------------------------- --------------------------------------
1 Phone 100 2020-11-20 00:00:00.000 (first row) 2020-11-20 02:00:00.000 (second row)
1 Phone 200 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
2 Apple 5 2020-11-20 00:00:00.000 (first row) 2020-11-20 01:00:00.000 (first row)
2 Apple 10 2020-11-20 01:01:00.000 (second row) 2020-11-20 02:00:00.000 (second row)
2 Pineapple 10 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
3 Orange juice 100 2020-11-21 00:00:00.000 (first row) 2020-11-20 03:00:00.000 (third row)
The closest I got to was the following:
;WITH history AS (
    SELECT
        ProductId,
        Name,
        Price,
        StartDate,
        EndDate
    FROM (
        SELECT
            ROW_NUMBER() OVER (PARTITION BY ProductId ORDER BY StartDate DESC) 'RowNumber',
            *
        FROM ProductHistory
    ) history
    WHERE history.RowNumber = 1 -- select newest row per ProductId

    UNION ALL

    SELECT
        previous.ProductId,
        previous.Name,
        previous.Price,
        previous.StartDate,
        EndDate
    FROM (
        SELECT
            ROW_NUMBER() OVER (PARTITION BY previous.ProductId ORDER BY previous.StartDate DESC) 'RowNumber',
            previous.*
        FROM history [current]
        INNER JOIN ProductHistory previous
            ON previous.ProductId = [current].ProductId
            AND previous.StartDate < [current].StartDate
            AND (
                previous.Name <> [current].Name
                OR previous.Price <> [current].Price
            )
    ) previous
    WHERE previous.RowNumber = 1 -- select previous row of each ProductId, recursively
)
SELECT *
FROM history
ORDER BY
    ProductId,
    StartDate
ProductId Name Price StartDate EndDate
----------- -------------- -------- ------------------------- -------------------------
1 Phone 100,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
1 Phone 200,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
2 Apple 5,00 2020-11-20 00:00:00.000 2020-11-20 01:00:00.000
2 Apple 10,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
2 Pineapple 10,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
3 Orange juice 100,00 2020-11-21 02:01:00.000 2020-11-21 03:00:00.000
While the Name and Price column values are right, I'm not sure how to aggregate the StartDate and EndDate columns to get what I need. All code is available in this fiddle, if it's of any help.
This is a type of gaps-and-islands problem. Probably the simplest method is the difference of row numbers:
select productid, name, price, min(startdate), max(enddate)
from (select ph.*,
row_number() over (partition by productid order by startdate) as seqnum,
row_number() over (partition by productid, name, price order by startdate) as seqnum_2
from producthistory
) ph
group by productid, name, price, (seqnum - seqnum_2);
This assumes that there are no gaps in the time frames -- which seems reasonable with this data model.
Why does this work? That is a little hard to explain. But if you look at the results of the subquery, you will see how the difference between the two row numbers is constant for adjacent rows where name and price are the same.
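For example, for productid 1 ordered by startdate, the subquery yields (derived from the sample data):
Name   Price  seqnum  seqnum_2  seqnum - seqnum_2
-----  -----  ------  --------  -----------------
Phone  100    1       1         0
Phone  100    2       2         0
Phone  200    3       1         2
The two unchanged 'Phone, 100' rows share the difference 0, so they collapse into one group, while the price change opens a new group.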

Given two date ranged discount tables and product price, calculate date ranged final price

I have two tables with seasonal discounts. In each of these two tables are non overlapping date ranges, product id and discount that applies in that date range. Date ranges from one table however may overlap with date ranges in the other table. Given a third table with product id and its default price, the goal is to efficiently calculate seasonal - date ranged prices for product id after discounts from both tables have been applied.
Discounts multiply only in their overlapping period, e.g. if a first discount is 0.9 (10%) from 2019-07-01 to 2019-07-30, and a second discount is 0.8 from 2019-07-16 to 2019-08-15, this translates to: 0.9 discount from 2019-07-01 to 2019-07-15, 0.72 discount from 2019-07-16 to 2019-07-30, and 0.8 discount from 2019-07-31 to 2019-08-15.
I have managed to come to a solution: first I generate a table holding, in order, all of the start and end dates from both discount tables; then a resulting table of all smallest disjoint intervals; and then, for each interval, all candidate prices: the default price, the price with only the first table's discount applied (if any applies), the price with only the second table's discount applied (if any applies), and the price with both discounts applied (where possible), finally taking the min of these four prices. See the sample code below.
create table #pricesDefault (product_id int, price decimal)
insert into #pricesDefault
values
(1, 100),
(2, 120),
(3, 200),
(4, 50)
create table #discountTypeA (product_id int, modifier decimal(4,2), startdate datetime, enddate datetime)
insert into #discountTypeA
values
(1, 0.75, '2019-06-06', '2019-07-06'),
(1, 0.95, '2019-08-06', '2019-08-20'),
(1, 0.92, '2019-05-06', '2019-06-05'),
(2, 0.75, '2019-06-08', '2019-07-19'),
(2, 0.95, '2019-07-20', '2019-09-20'),
(3, 0.92, '2019-05-06', '2019-06-05')
create table #discountTypeB (product_id int, modifier decimal(4,2), startdate datetime, enddate datetime)
insert into #discountTypeB
values
(1, 0.85, '2019-06-20', '2019-07-03'),
(1, 0.65, '2019-08-10', '2019-08-29'),
(1, 0.65, '2019-09-10', '2019-09-27'),
(3, 0.75, '2019-05-08', '2019-05-19'),
(2, 0.95, '2019-05-20', '2019-05-21'),
(3, 0.92, '2019-09-06', '2019-09-09')
create table #pricingPeriod (product_id int, discountedPrice decimal, startdate datetime, enddate datetime);
with allDates(product_id, dt) as
(
    select distinct product_id, dta.startdate from #discountTypeA dta
    union all
    select distinct product_id, dta.enddate from #discountTypeA dta
    union all
    select distinct product_id, dtb.startdate from #discountTypeB dtb
    union all
    select distinct product_id, dtb.enddate from #discountTypeB dtb
),
allproductDatesWithId as
(
    select product_id, dt, row_number() over (partition by product_id order by dt asc) 'Id'
    from allDates
),
sched as
(
    select pd.product_id, apw1.dt startdate, apw2.dt enddate
    from #pricesDefault pd
    join allproductDatesWithId apw1 on apw1.product_id = pd.product_id
    join allproductDatesWithId apw2 on apw2.product_id = pd.product_id and apw2.Id = apw1.Id + 1
),
discountAppliedTypeA as
(
    select sc.product_id, sc.startdate, sc.enddate,
           min(case when sc.startdate >= dta.startdate and dta.enddate >= sc.enddate then pd.price * dta.modifier else pd.price end) 'price'
    from sched sc
    join #pricesDefault pd on pd.product_id = sc.product_id
    left join #discountTypeA dta on sc.product_id = dta.product_id
    group by sc.product_id, sc.startdate, sc.enddate
),
discountAppliedTypeB as
(
    select daat.product_id, daat.startdate, daat.enddate,
           min(case when daat.startdate >= dta.startdate and dta.enddate >= daat.enddate then daat.price * dta.modifier else daat.price end) 'price'
    from discountAppliedTypeA daat
    left join #discountTypeB dta on daat.product_id = dta.product_id
    group by daat.product_id, daat.startdate, daat.enddate
)
select * from discountAppliedTypeB
order by product_id, startdate
Calculating a min of all possible prices is unnecessary overhead. I'd like to generate just one resulting price and have it as the final price.
Here is the resulting set:
product_id start_date end_date final_price
1 2019-05-06 00:00:00.000 2019-06-05 00:00:00.000 92.0000
1 2019-06-05 00:00:00.000 2019-06-06 00:00:00.000 100.0000
1 2019-06-06 00:00:00.000 2019-06-20 00:00:00.000 75.0000
1 2019-06-20 00:00:00.000 2019-07-03 00:00:00.000 63.7500
1 2019-07-03 00:00:00.000 2019-07-06 00:00:00.000 75.0000
1 2019-07-06 00:00:00.000 2019-08-06 00:00:00.000 100.0000
1 2019-08-06 00:00:00.000 2019-08-10 00:00:00.000 95.0000
1 2019-08-10 00:00:00.000 2019-08-20 00:00:00.000 61.7500
1 2019-08-20 00:00:00.000 2019-08-29 00:00:00.000 65.0000
1 2019-08-29 00:00:00.000 2019-09-10 00:00:00.000 100.0000
1 2019-09-10 00:00:00.000 2019-09-27 00:00:00.000 65.0000
2 2019-05-20 00:00:00.000 2019-05-21 00:00:00.000 114.0000
2 2019-05-21 00:00:00.000 2019-06-08 00:00:00.000 120.0000
2 2019-06-08 00:00:00.000 2019-07-19 00:00:00.000 90.0000
2 2019-07-19 00:00:00.000 2019-07-20 00:00:00.000 120.0000
2 2019-07-20 00:00:00.000 2019-09-20 00:00:00.000 114.0000
3 2019-05-06 00:00:00.000 2019-05-08 00:00:00.000 184.0000
3 2019-05-08 00:00:00.000 2019-05-19 00:00:00.000 138.0000
3 2019-05-19 00:00:00.000 2019-06-05 00:00:00.000 184.0000
3 2019-06-05 00:00:00.000 2019-09-06 00:00:00.000 200.0000
3 2019-09-06 00:00:00.000 2019-09-09 00:00:00.000 184.0000
Is there a more efficient approach to this that I am not seeing?
I have a large data set: ~20K rows in the real product prices table, and 100K-200K rows in each of the discount tables.
The indexing structure of the actual tables is as follows: product_id is the clustered index on the product prices table, whilst the discount tables have an Id surrogate column as clustered index (and primary key), plus a non-clustered index on (product_id, start_date, end_date).
You can generate the dates using union. Then bring in all discounts that are valid on that date, and calculate the total.
This looks like:
with prices as (
select a.product_id, v.dte
from #discountTypeA a cross apply
(values (a.startdate), (a.enddate)) v(dte)
union -- on purpose to remove duplicates
select b.product_id, v.dte
from #discountTypeB b cross apply
(values (b.startdate), (b.enddate)) v(dte)
),
p as (
select p.*, 1-a.modifier as a_discount, 1-b.modifier as b_discount, pd.price
from prices p left join
#pricesDefault pd
on pd.product_id = p.product_id left join
#discountTypeA a
on p.product_id = a.product_id and
p.dte >= a.startdate and p.dte < a.enddate left join
#discountTypeB b
on p.product_id = b.product_id and
p.dte >= b.startdate and p.dte < b.enddate
)
select p.product_id, price * (1 - coalesce(a_discount, 0)) * (1 - coalesce(b_discount, 0)) as price, a_discount, b_discount,
dte as startdate, lead(dte) over (partition by product_id order by dte) as enddate
from p
order by product_id, dte;
Here is a db<>fiddle.
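One wrinkle to note if you try this: each product's last boundary date has no following date, so lead() returns NULL for its enddate. If those trailing open-ended rows aren't wanted, the final SELECT can be wrapped, keeping the prices and p CTE definitions above unchanged; a minimal sketch:
select *
from (
    select p.product_id,
           price * (1 - coalesce(a_discount, 0)) * (1 - coalesce(b_discount, 0)) as price,
           dte as startdate,
           lead(dte) over (partition by product_id order by dte) as enddate
    from p
) q
where q.enddate is not null -- drop each product's trailing open-ended row
order by q.product_id, q.startdate;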
Here is a version that works out the price for every date. You can then either use this directly, or use one of the many solutions on SO for working out date ranges.
In this example I have hard coded the date limits, but you could easily read them from your tables if you prefer.
I haven't done any performance testing on this, but give it a go. It's quite a bit simpler, and if you have the right indexes it might be quicker.
;with dates as (
    select convert(datetime, '2019-05-06') as d
    union all
    select d + 1 from dates where d < '2019-09-27'
)
select pricesDefault.product_id, d, pricesDefault.price as baseprice,
       discountA.modifier as dA,
       discountB.modifier as dB,
       pricesDefault.price * isnull(discountA.modifier, 1) * isnull(discountB.modifier, 1) as finalprice
from #pricesDefault pricesDefault
cross join dates
left join #discountTypeA discountA on discountA.product_id = pricesDefault.product_id and d between discountA.startdate and discountA.enddate
left join #discountTypeB discountB on discountB.product_id = pricesDefault.product_id and d between discountB.startdate and discountB.enddate
order by pricesDefault.product_id, d
option (maxrecursion 1000)
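If contiguous ranges are needed instead of one row per day, the difference-of-row-numbers technique shown earlier in this thread applies here too. A minimal sketch, assuming the per-day output above has been captured into a hypothetical #dailyPrices table with columns (product_id, d, finalprice):
-- #dailyPrices is hypothetical: one row per product per day, from the query above
select product_id, finalprice,
       min(d) as startdate, max(d) as enddate
from (
    select dp.*,
           row_number() over (partition by product_id order by d)
             - row_number() over (partition by product_id, finalprice order by d) as grp
    from #dailyPrices dp
) dp
group by product_id, finalprice, grp
order by product_id, min(d);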

Sql group by latest repeated field

I don't even know what's a good title for this question.
I have this table:
create table trans
(
    [transid] INT IDENTITY (1, 1) NOT NULL,
    [customerid] int not null,
    [points] decimal(10,2) not null,
    [date] datetime not null
)
and records:
--cus1
INSERT INTO trans ( customerid , points , date )
VALUES ( 1, 10, '2016-01-01' ) , ( 1, 20, '2017-02-01' ) , ( 1, 22, '2017-03-01' ) ,
( 1, 24, '2018-02-01' ) , ( 1, 50, '2018-02-25' ) , ( 2, 44, '2016-02-01' ) ,
( 2, 20, '2017-02-01' ) , ( 2, 32, '2017-03-01' ) , ( 2, 15, '2018-02-01' ) ,
( 2, 10, '2018-02-25' ) , ( 3, 10, '2018-02-25' ) , ( 4, 44, '2015-02-01' ) ,
( 4, 20, '2015-03-01' ) , ( 4, 32, '2016-04-01' ) , ( 4, 15, '2016-05-01' ) ,
( 4, 10, '2017-02-25' ) , ( 4, 10, '2018-02-27' ) ,( 4, 20, '2018-02-28' ) ,
( 5, 44, '2015-02-01' ) , ( 5, 20, '2015-03-01' ) , ( 5, 32, '2016-04-01' ) ,
( 5, 15, '2016-05-01' ) ,( 5, 10, '2017-02-25' );
-- selecting the data
select * from trans
Produces:
transid customerid points date
----------- ----------- --------------------------------------- -----------------------
1 1 10.00 2016-01-01 00:00:00.000
2 1 20.00 2017-02-01 00:00:00.000
3 1 22.00 2017-03-01 00:00:00.000
4 1 24.00 2018-02-01 00:00:00.000
5 1 50.00 2018-02-25 00:00:00.000
6 2 44.00 2016-02-01 00:00:00.000
7 2 20.00 2017-02-01 00:00:00.000
8 2 32.00 2017-03-01 00:00:00.000
9 2 15.00 2018-02-01 00:00:00.000
10 2 10.00 2018-02-25 00:00:00.000
11 3 10.00 2018-02-25 00:00:00.000
12 4 44.00 2015-02-01 00:00:00.000
13 4 20.00 2015-03-01 00:00:00.000
14 4 32.00 2016-04-01 00:00:00.000
15 4 15.00 2016-05-01 00:00:00.000
16 4 10.00 2017-02-25 00:00:00.000
17 4 10.00 2018-02-27 00:00:00.000
18 4 20.00 2018-02-28 00:00:00.000
19 5 44.00 2015-02-01 00:00:00.000
20 5 20.00 2015-03-01 00:00:00.000
21 5 32.00 2016-04-01 00:00:00.000
22 5 15.00 2016-05-01 00:00:00.000
23 5 10.00 2017-02-25 00:00:00.000
I'm trying to group all the customerid values and sum their points. But here's the catch: if a customer has no transaction for 1 year (the next transaction is 1 year or more later), the points accumulated so far expire.
For this case, the points for each customer should be:
Customer1 20+22+24+50
Customer2 20+32+15+10
Customer3 10
Customer4 10+20
Customer5 0
Here's what I have so far:
select
    t1.transid as transid1,
    t1.customerid as customerid1,
    t1.date as date1,
    t1.points as points1,
    t1.rank1 as rank1,
    t2.transid as transid2,
    t2.customerid as customerid2,
    t2.points as points2,
    isnull(t2.date, getUTCDate()) as date2,
    isnull(t2.rank2, t1.rank1 + 1) as rank2,
    cast(case when (t1.date > dateadd(year, -1, isnull(t2.date, getUTCDate()))) then 0 else 1 end as bit) as ShouldExpire
from
(
    select transid, CustomerID, Date, points,
           RANK() OVER (PARTITION BY CustomerID ORDER BY date ASC) AS RANK1
    from trans
) t1
left join
(
    select transid, CustomerID, Date, points,
           RANK() OVER (PARTITION BY CustomerID ORDER BY date ASC) AS RANK2
    from trans
) t2 on t1.RANK1 = t2.RANK2 - 1
    and t1.customerid = t2.customerid
which gives each transaction joined to the customer's next one, along with a ShouldExpire flag. From that result, how do I check the ShouldExpire field on the row with max(rank1) per customer: if it's 1, then totalpoints will be 0; otherwise, sum all the consecutive 0's until there are no more records or a 1 is met?
Or is there a better approach to this problem?
The following query uses LEAD to get the date of the next record within the same CustomerID slice:
;WITH CTE AS (
    SELECT transid, CustomerID, [Date], points,
           LEAD([Date]) OVER (PARTITION BY CustomerID
                              ORDER BY [Date] ASC) AS nextDate,
           CASE
               WHEN [Date] > DATEADD(YEAR,
                                     -1,
                                     -- same LEAD() here as above
                                     ISNULL(LEAD([Date]) OVER (PARTITION BY CustomerID
                                                               ORDER BY [Date] ASC),
                                            getUTCDate()))
               THEN 0
               ELSE 1
           END AS ShouldExpire
    FROM trans
)
SELECT transid, CustomerID, [Date], points, nextDate, ShouldExpire
FROM CTE
ORDER BY CustomerID, [Date]
Output:
transid CustomerID Date points nextDate ShouldExpire
-------------------------------------------------------------
1 1 2016-01-01 10.00 2017-02-01 1 <-- last exp. for 1
2 1 2017-02-01 20.00 2017-03-01 0
3 1 2017-03-01 22.00 2018-02-01 0
4 1 2018-02-01 24.00 2018-02-25 0
5 1 2018-02-25 50.00 NULL 0
6 2 2016-02-01 44.00 2017-02-01 1 <-- last exp. for 2
7 2 2017-02-01 20.00 2017-03-01 0
8 2 2017-03-01 32.00 2018-02-01 0
9 2 2018-02-01 15.00 2018-02-25 0
10 2 2018-02-25 10.00 NULL 0
11 3 2018-02-25 10.00 NULL 0 <-- no exp. for 3
12 4 2015-02-01 44.00 2015-03-01 0
13 4 2015-03-01 20.00 2016-04-01 1
14 4 2016-04-01 32.00 2016-05-01 0
15 4 2016-05-01 15.00 2017-02-25 0
16 4 2017-02-25 10.00 2018-02-27 1 <-- last exp. for 4
17 4 2018-02-27 10.00 2018-02-28 0
18 4 2018-02-28 20.00 NULL 0
19 5 2015-02-01 44.00 2015-03-01 0
20 5 2015-03-01 20.00 2016-04-01 1
21 5 2016-04-01 32.00 2016-05-01 0
22 5 2016-05-01 15.00 2017-02-25 0
23 5 2017-02-25 10.00 NULL 1 <-- last exp. for 5
Now, you seem to want to calculate the sum of points after the last expiration.
Using the above CTE as a basis, you can achieve the required result with the query below; the running SUM(ShouldExpire) in descending date order gives rnk = 0 exactly for the rows that come after the customer's last expiration, so only those rows contribute points:
;WITH CTE AS (
    ... above query here ...
)
SELECT CustomerID,
       SUM(CASE WHEN rnk = 0 THEN points ELSE 0 END) AS sumOfPoints
FROM (
    SELECT transid, CustomerID, [Date], points, nextDate, ShouldExpire,
           SUM(ShouldExpire) OVER (PARTITION BY CustomerID ORDER BY [Date] DESC) AS rnk
    FROM CTE
) AS t
GROUP BY CustomerID
Output:
CustomerID sumOfPoints
-----------------------
1 116.00
2 77.00
3 10.00
4 30.00
5 0.00
Demo here
The tricky part here is to dump all points when they expire and start accumulating them again. I assumed that if there is only one transaction, we don't expire the points until there's a new transaction, even if that first transaction was over a year ago now.
I also get a different answer for customer #5, as they do appear to have a "transaction chain" that hasn't expired.
Here's my query:
WITH ordered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY customerid ORDER BY [date]) AS order_id
    FROM
        trans),
max_transid AS (
    SELECT
        customerid,
        MAX(transid) AS max_transid
    FROM
        trans
    GROUP BY
        customerid),
not_expired AS (
    SELECT
        t1.customerid,
        t1.points,
        t1.[date] AS t1_date,
        CASE
            WHEN m.customerid IS NOT NULL THEN GETDATE()
            ELSE t2.[date]
        END AS t2_date
    FROM
        ordered t1
        LEFT JOIN ordered t2 ON t2.customerid = t1.customerid AND t1.transid != t2.transid AND t2.order_id = t1.order_id + 1 AND t1.[date] > DATEADD(YEAR, -1, t2.[date])
        LEFT JOIN max_transid m ON m.customerid = t1.customerid AND m.max_transid = t1.transid
),
max_not_expired AS (
    SELECT
        customerid,
        MAX(t1_date) AS max_expired
    FROM
        not_expired
    WHERE
        t2_date IS NULL
    GROUP BY
        customerid)
SELECT
    n.customerid,
    SUM(n.points) AS points
FROM
    not_expired n
    LEFT JOIN max_not_expired m ON m.customerid = n.customerid
WHERE
    ISNULL(m.max_expired, '19000101') < n.t1_date
GROUP BY
    n.customerid;
It could be refactored to be simpler, but I wanted to show the steps to get to the final answer:
customerid points
1 116.00
2 77.00
3 10.00
4 30.00
5 57.00
Can you try this:
SELECT t1.customerid,
       Sum(t1.points)
FROM trans t1
WHERE NOT EXISTS (SELECT 1
                  FROM trans t2
                  WHERE t2.customerid = t1.customerid -- compare only within the same customer
                    AND Datediff(year, t1.date, t2.date) >= 1)
GROUP BY t1.customerid
Hope it helps!
try this:
select customerid, Sum(points)
from trans
where Datediff(year, date, GETDATE()) < 1
group by customerid
output:
customerid Points
1 - 74.00
2 - 25.00
3 - 10.00
4 - 30.00

New column which indicate min date from group of value

I would like to add a new column which will indicate the min value of a subgroup.
Id ShopId OrderDate
12232018 12229018 2011-01-01 00:00:00.000
12232018 12229018 2012-01-01 00:00:00.000
12232018 12394018 2012-02-02 00:00:00.000
12232018 11386005 2012-03-01 00:00:00.000
12232018 14347023 2012-04-02 00:00:00.000
12232018 14026026 2014-03-16 00:00:00.000
Here is the result I want to get:
NewCol Id ShopId OrderDate
1 12232018 12229018 2011-01-01 00:00:00.000
1 12232018 12229018 2012-01-01 00:00:00.000
0 12232018 12394018 2012-02-02 00:00:00.000
0 12232018 11386005 2012-03-01 00:00:00.000
0 12232018 14347023 2012-04-02 00:00:00.000
0 12232018 14026026 2014-03-16 00:00:00.000
Because that ShopId has the min OrderDate for the Id, I would like to assign '1' to it.
You can use min with a windowing function to get this, as below:
select NewCol = case when orderdate = min(orderdate) over () then 1 else 0 end, *
from yourtable
-- You might need to add PARTITION BY Id or ShopId, depending on the requirement
Try this:
SELECT Id, ShopId, OrderDate,
       CASE
           WHEN MIN(OrderDate) OVER (PARTITION BY Id, ShopId) =
                MIN(OrderDate) OVER (PARTITION BY Id) THEN 1
           ELSE 0
       END AS NewCol
FROM mytable
The query uses the windowed version of MIN in order to compare the minimum-per-Id OrderDate to the minimum-per-(Id, ShopId) date. If these two values are the same, then we mark the corresponding (Id, ShopId) partition with 1. For the sample data, MIN(OrderDate) for (12232018, 12229018) is 2011-01-01, which is also the minimum for Id 12232018 as a whole, so those two rows get 1.
Demo here
Less elegant than the others, but it is ANSI:
select MyTable.*, case when a1.mindate = orderdate then 1 else 0 end as NewCol
from MyTable
inner join
(
    select id, min(orderdate) as mindate
    from MyTable
    group by id
) a1
Use min of OrderDate grouped by ShopId and use that in the CASE WHEN statement, like this:
Select case when (a.OrderDate = b.min_order_dt) then 1 else 0 end as NewCol, a.*
from your_table_name a
inner join
(
    SELECT ShopId, min(OrderDate) as min_order_dt
    from your_table_name
    group by ShopId
) b
on a.ShopId = b.ShopId;
Try this
select case when t2.ShopId is null then 0 else 1 end as newcol,
       t1.id, t1.ShopId, t1.OrderDate
from your_table_name as t1
left join
(
    select ShopId, min(OrderDate) as OrderDate
    from your_table_name
    group by ShopId
) as t2 on t1.ShopId = t2.ShopId and t1.OrderDate = t2.OrderDate