Find date sequence in SQL Server - sql

I'm trying to find the longest sequence of consecutive days per customer in my data.
I want to know the longest run of consecutive days on which a specific customer was active. For example, if someone used my app on 25/08/16, 26/08/16, 27/08/16, 01/09/16 and 02/09/16, the max sequence is 3 days (25, 26, 27).
In the end (The output) I want to get two fields: custid | MaxDaySequence
I have the following fields in my data table:
custid | orderdate (timestamp)
For example:
custid orderdate
1 25/08/2007
1 03/10/2007
1 13/10/2007
1 15/01/2008
1 16/03/2008
1 09/04/2008
2 18/09/2006
2 08/08/2007
2 28/11/2007
2 04/03/2008
3 27/11/2006
3 15/04/2007
3 13/05/2007
3 19/06/2007
3 22/09/2007
3 25/09/2007
3 28/01/2008
I'm using SQL Server 2014.
Thanks

There is a trick: if you have an incrementing number ordered by date, then subtracting that number of days from each date gives the same result for all dates that are consecutive. So like this:
-- Gaps-and-islands: list every run of consecutive order days per customer.
-- Within a run the date advances by one day per row, exactly like ROW_NUMBER,
-- so (day - rn days) is constant for the run and can be used as the group key.
-- `orders` is a placeholder: replace it with the table that holds custid/orderdate.
SELECT
    custid,
    MIN(orderday) AS start_of_group,
    MAX(orderday) AS end_of_group,
    COUNT(*)      AS num_days
FROM (
    SELECT
        custid,
        orderday,
        ROW_NUMBER() OVER (PARTITION BY custid ORDER BY orderday) AS rn
    FROM (
        -- orderdate is a timestamp: reduce it to one row per customer per
        -- calendar day, otherwise two orders on the same day break the run.
        SELECT DISTINCT
            custid,
            CAST(orderdate AS date) AS orderday
        FROM orders
    ) AS d
) AS x
GROUP BY custid, DATEADD(day, -rn, orderday);
You could take the result of this and pull out the max number of days to solve your problem:
-- Longest run of consecutive order days per customer.
-- Inner levels: (1) one row per customer per calendar day, (2) number the days,
-- (3) count the days in each island, where (day - rn days) is constant.
-- `orders` is a placeholder: replace it with the table that holds custid/orderdate.
SELECT
    custid,
    MAX(num_days) AS longest
FROM (
    SELECT
        custid,
        COUNT(*) AS num_days
    FROM (
        SELECT
            custid,
            orderday,
            ROW_NUMBER() OVER (PARTITION BY custid ORDER BY orderday) AS rn
        FROM (
            -- de-duplicate to calendar days so same-day orders do not split a run
            SELECT DISTINCT
                custid,
                CAST(orderdate AS date) AS orderday
            FROM orders
        ) AS d
    ) AS x
    GROUP BY custid, DATEADD(day, -rn, orderday)
) AS y
GROUP BY custid;

If you want to solve it with MySQL:
-- MySQL: longest run of consecutive visit days per user.
-- ROW_NUMBER() is emulated with user variables (@curRow restarts at 1 whenever
-- user_id changes). The counter alias is `rn` because RANK is a reserved word
-- in MySQL 8.
-- NOTE(review): MySQL does not guarantee the evaluation order of user variables
-- inside one statement; on MySQL 8+ prefer ROW_NUMBER() OVER (...).
SELECT
    user_id,
    MAX(num_days) AS longest
FROM (
    SELECT
        user_id,
        COUNT(*) AS num_days
    FROM (
        SELECT
            (CASE a1.user_id
                 WHEN @curType THEN @curRow := @curRow + 1
                 -- resets the counter and remembers the new user in one expression;
                 -- presumably relies on user_id being non-zero so the AND yields 1
                 ELSE @curRow := 1 AND @curType := a1.user_id
             END) AS rn,
            a1.user_id,
            a1.last_update AS dat
        FROM (
            -- one row per user per calendar day ('YYYY-MM-DD' prefix of the timestamp).
            -- Positional GROUP BY is kept on purpose: GROUP BY last_update would
            -- resolve to the raw column a2.last_update, not to the day alias.
            SELECT
                a2.user_id,
                LEFT(FROM_UNIXTIME(a2.last_update), 10) AS last_update
            FROM visits AS a2
            GROUP BY 1, 2
        ) AS a1,
        (SELECT @curRow := 0, @curType := '') AS r
        ORDER BY a1.user_id DESC, dat
    ) AS x
    -- consecutive days share the same (day - rn days) value
    GROUP BY user_id, DATE_ADD(dat, INTERVAL -rn DAY)
) AS y
GROUP BY user_id
ORDER BY longest DESC

Related

Find customers with at least 5 transactions in At most 3 consecutive days

I have a table in SQL Server that contains customers' transactions From 2022-02-10 to 2022-03-10.
I want to find customers that have at least 5 transactions within at most three consecutive days.
For example, for the table below the output should be CustomerId = 2 and CustomerId = 3.
Id
CustomerId
Transactiondate
1
1
2022-03-01
2
1
2022_03_01
3
1
2022_03_05
4
1
2022_03_07
5
1
2022_03_07
6
2
2022_03_05
7
2
2022_03_05
8
2
2022_03_06
9
2
2022_03_06
10
2
2022_03_07
1
3
2022-03-01
2
3
2022_03_01
3
3
2022_03_01
4
3
2022_03_03
5
3
2022_03_03
I tried this query but it doesn't have good performance for a large table:
-- For every transaction p1, count the customer's OTHER transactions that fall
-- in the 3-day window starting at p1's date; 4 others + p1 itself = 5.
-- Uses T-SQL DATEADD and the TransactionDate column for the window end.
SELECT DISTINCT p1.customerid
FROM trntbl AS p1
INNER JOIN trntbl AS p2
    ON  p2.customerid = p1.customerid
    AND p2.id <> p1.id
    AND p2.TransactionDate >= p1.TransactionDate
    AND p2.TransactionDate <  DATEADD(day, 3, p1.TransactionDate)
GROUP BY p1.customerid, p1.id
HAVING COUNT(*) >= 4
If customers must have done transactions in three consecutive days (meaning that 5 transactions in a day then nothing in the next two days wouldn't count), then this can be done with two self joins:
-- Customers with transactions on three consecutive calendar days whose
-- combined transaction count is at least 5.
WITH daily AS (
    -- number of transactions per customer per day
    SELECT
        CustomerId,
        Transactiondate,
        COUNT(*) AS ct
    FROM table_name
    GROUP BY CustomerId, Transactiondate
)
SELECT DISTINCT d3.CustomerId
FROM daily AS d3                      -- third (latest) day
INNER JOIN daily AS d2                -- second day = third day - 1
    ON  d2.CustomerId = d3.CustomerId
    AND d3.Transactiondate = DATEADD(day, 1, d2.Transactiondate)
INNER JOIN daily AS d1                -- first day = second day - 1
    ON  d1.CustomerId = d2.CustomerId
    AND d2.Transactiondate = DATEADD(day, 1, d1.Transactiondate)
-- apply the "at least 5 transactions" requirement to the three-day window
WHERE d1.ct + d2.ct + d3.ct >= 5
;
Fiddle
Although this is a gaps-and-islands problem, there are shortcuts you can take.
You can group it up by date, then get the row 2 previous, and filter by only rows where the 2 previous row is exactly two days apart.
-- Customers that have activity on three consecutive calendar days:
-- after reducing to one row per customer per day, a day whose
-- second-previous active day is exactly two days earlier closes such a run.
WITH customer_days AS (
    SELECT DISTINCT
        src.CustomerId,
        CAST(src.Transactiondate AS date) AS TranDay
    FROM YourTable AS src
),
with_lookback AS (
    SELECT
        CustomerId,
        TranDay,
        LAG(TranDay, 2) OVER (PARTITION BY CustomerId ORDER BY TranDay) AS TwoDaysBack
    FROM customer_days
)
SELECT DISTINCT
    CustomerId
FROM with_lookback
WHERE DATEDIFF(day, TwoDaysBack, TranDay) = 2
db<>fiddle
If the base table only has a maximum of one row per date then you can forgo the GROUP BY.
This is actually a gaps and islands problem, you can solve by using analytic window functions to subtract sequential row_number from consecutive days and then grouping, after first "plugging" any gaps with the help of a numbers table.
-- Gaps-and-islands using a numbers table to fill in days without transactions.
-- numbers: 20 integers, 0..19, generated by numbering rows of master.dbo.spt_values.
with numbers as (select top(20) Row_Number() over(order by (select null))-1 n from master.dbo.spt_values),
-- dRanges: first and last transaction date of each customer.
dRanges as (
select customerId,
Min(Transactiondate) CustStartDate,
Max(Transactiondate) CustEndDate
from t
group by CustomerId
), dates as (
-- dates: one row per customer per calendar day from CustStartDate up to CustEndDate.
-- Only offsets 0..19 exist in numbers, so at most the first 20 days are generated.
select *
from dranges r
outer apply (
select DateAdd(day,n,r.CustStartDate) SeqDate
from numbers n
where DateAdd(day,n,r.CustStartDate) < = r.CustEndDate
)d
), q as (
-- q: number of transactions per customer per transaction date.
select customerId, transactiondate, Count(*) qty
from t
group by CustomerId, Transactiondate
), g as (
-- g: every generated day with its transaction count (0 when there were none),
-- plus dGrp = SeqDate minus its per-customer row number.
-- NOTE(review): SeqDate has no gaps per customer, so dGrp looks constant for a
-- customer, which would put all of a customer's days in one group -- TODO confirm.
select d.CustomerId, d.SeqDate, IsNull(q.qty,0)Qty,
DateAdd(day, - row_number() over (partition by d.customerid order by d.SeqDate), d.SeqDate) as dGrp
from dates d
left join q on q.Transactiondate = d.SeqDate and q.CustomerId = d.CustomerId
)
-- Keep groups spanning at most 3 generated days that contain at least 5 transactions.
select customerId
from g
group by CustomerId, dGrp
having Count(*) <= 3 and Sum(qty) >= 5
DB<>Fiddle
You could make use of datediff function and verify if the sum of the date differences are between 3 and 5 (provided the max of the differences is just 1) since the dates might be unique (for example customerid 2 can have transaction dates as 5,6,7,8,9 of March 2022) and this should be taken into account too.
-- Sample data in a table variable (T-SQL table variables are named with @).
DECLARE @tbl TABLE (id int IDENTITY, customerid int, transactiondate date);

INSERT INTO @tbl (customerid, transactiondate)
VALUES (1, '2022-03-01')
     , (1, '2022-03-01')
     , (1, '2022-03-05')
     , (1, '2022-03-07')
     , (1, '2022-03-07')
     , (2, '2022-03-05')
     , (2, '2022-03-05')
     , (2, '2022-03-06')
     , (2, '2022-03-06')
     , (2, '2022-03-07');

-- Innermost query: for each transaction, the day gap to the customer's next
-- transaction (for the customer's last row: the gap from the previous one),
-- plus a per-customer row number.
-- Middle query: running sum and running max of those gaps per customer.
-- Outer filter: look at each customer's 5th transaction and require that no
-- gap so far exceeded 1 day and that the gaps add up to between 3 and 5 days.
SELECT t1.customerid
FROM (
    SELECT
        t.*,
        SUM(t.[datediff]) OVER (PARTITION BY t.customerid ORDER BY t.transactiondate) AS [sum],
        MAX(t.[datediff]) OVER (PARTITION BY t.customerid ORDER BY t.transactiondate) AS [max]
    FROM (
        SELECT
            customerid,
            transactiondate,
            DATEDIFF(
                DAY,
                -- start of the gap: this row, or the previous row when there is no next row
                CASE
                    WHEN LEAD(transactiondate, 1) OVER (PARTITION BY customerid ORDER BY transactiondate) IS NULL
                        THEN LAG(transactiondate, 1, transactiondate) OVER (PARTITION BY customerid ORDER BY transactiondate)
                    ELSE transactiondate
                END,
                -- end of the gap: the next row, or this row when there is no next row
                CASE
                    WHEN LEAD(transactiondate, 1) OVER (PARTITION BY customerid ORDER BY transactiondate) IS NULL
                        THEN transactiondate
                    ELSE LEAD(transactiondate, 1, transactiondate) OVER (PARTITION BY customerid ORDER BY transactiondate)
                END
            ) AS [datediff],
            ROW_NUMBER() OVER (PARTITION BY customerid ORDER BY transactiondate) AS rownum
        FROM @tbl
    ) AS t
) AS t1
WHERE t1.rownum = 5
  AND t1.[max] = 1
  AND t1.[sum] BETWEEN 3 AND 5

How do I find the Sum and Max value per Unique ID in HIVE?

basically how do I turn
id name quantity
1 Jerry 1
1 Jerry 2
1 Nana 1
2 Max 4
2 Lenny 3
into
id name quantity
1 Jerry 3
2 Max 4
in HIVE?
I want to sum up and find the highest quantity for each unique ID
You can use window functions with aggregation:
-- Sum quantity per (id, name), then keep the name with the highest total for each id.
SELECT
    ranked.id,
    ranked.name,
    ranked.quantity
FROM (
    SELECT
        id,
        name,
        SUM(quantity) AS quantity,
        -- position of this name within its id, largest total first
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY SUM(quantity) DESC) AS seqnum
    FROM t
    GROUP BY id, name
) ranked
WHERE ranked.seqnum = 1;
You can first calculate the sum of quantity per group, then rank them according to descending quantity, and finally filter the rows with rank = 1.
-- Step 1 (totals): sum quantity per (id, name).
-- Step 2 (ranked): number the names of each id by descending total.
-- Step 3: keep the top-ranked name per id.
-- Each derived table carries an alias because Hive requires every subquery
-- in a FROM clause to be named.
select
    ranked.id,
    ranked.name,
    ranked.quantity
from (
    select
        totals.id,
        totals.name,
        totals.quantity,
        row_number() over (partition by totals.id order by totals.quantity desc) as rn
    from (
        select id, name, sum(quantity) as quantity
        from mytable
        group by id, name
    ) totals
) ranked
where ranked.rn = 1;
try like below
-- Total quantity per (id, name); keep the rows whose total equals the
-- largest total of their id (ties return more than one row per id).
WITH name_totals AS (
    SELECT
        id,
        name,
        SUM(quantity) AS q
    FROM table_name
    GROUP BY id, name
)
SELECT
    cur.id,
    cur.name,
    cur.q
FROM name_totals cur
WHERE cur.q = (
    SELECT MAX(other.q)
    FROM name_totals other
    WHERE other.id = cur.id
)

Selecting rows that have row_number more than 1

I have a table as following (using bigquery):
id
year
month
sales
row_number
111
2020
11
1000
1
111
2020
12
2000
2
112
2020
11
3000
1
113
2020
11
1000
1
Is there a way in which I can select rows that have row numbers more than one?
For example, my desired output is:
id
year
month
sales
row_number
111
2020
11
1000
1
111
2020
12
2000
2
I don't want to just exclusively select rows with row_number = 2 but also row_number = 1 as well.
The original code block I used for the first table result is:
-- Monthly sales per id, with a running row number inside each id.
-- NOTE(review): the window orders by id, which is constant inside the partition,
-- so which month gets number 1 is arbitrary; order by year, month if the
-- numbering should follow the calendar.
SELECT
  id,
  year,
  month,
  SUM(sales) AS sales,
  ROW_NUMBER() OVER (PARTITION BY id ORDER BY id ASC) AS row_number
FROM
  table
GROUP BY
  id, year, month
You can use window functions:
-- Return every row whose id occurs more than once; the helper count
-- column is dropped from the output.
WITH counted AS (
  SELECT
    src.*,
    COUNT(*) OVER (PARTITION BY id) AS cnt
  FROM t AS src
)
SELECT counted.* EXCEPT (cnt)
FROM counted
WHERE counted.cnt > 1;
As applied to your aggregation query:
-- Monthly sales per id, restricted to ids that have more than one
-- (id, year, month) row. cnt is a helper column and is not returned.
SELECT iym.* EXCEPT (cnt)
FROM (
  SELECT
    id,
    year,
    month,
    SUM(sales) AS sales,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY id ASC) AS row_number,
    -- rows per id after grouping; no ORDER BY needed for a whole-partition count
    COUNT(*) OVER (PARTITION BY id) AS cnt
  FROM table
  GROUP BY id, year, month
) iym
WHERE cnt > 1;
You can wrap your query as in below example
-- Template: wrap the original query and keep all rows of every id that has
-- at least one row with row_number > 1. Replace YOUR_ORIGINAL_QUERY.
SELECT flagged.* EXCEPT (flag)
FROM (
  SELECT
    base.*,
    COUNTIF(base.row_number > 1) OVER (PARTITION BY base.id) > 0 AS flag
  FROM (YOUR_ORIGINAL_QUERY) AS base
) AS flagged
WHERE flagged.flag
so it can look as
-- Monthly sales per id, keeping only ids that have a row numbered above 1.
WITH monthly AS (
  SELECT
    id,
    year,
    month,
    SUM(sales) AS sales,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY id ASC) AS row_number
  FROM table
  GROUP BY id, year, month
),
flagged AS (
  SELECT
    monthly.*,
    -- true for every row of an id that has at least one row_number > 1
    COUNTIF(monthly.row_number > 1) OVER (PARTITION BY monthly.id) > 0 AS flag
  FROM monthly
)
SELECT flagged.* EXCEPT (flag)
FROM flagged
WHERE flagged.flag
so when applied to sample data in your question - it will produce below output
Try this:
-- Monthly sales per id; return all rows of the ids that own a row numbered 2.
WITH monthly AS (
  SELECT
    id,
    year,
    month,
    SUM(sales) AS sales,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY id ASC) AS row_number
  FROM table
  GROUP BY id, year, month
)
SELECT *
FROM monthly AS cur
WHERE EXISTS (
  SELECT 1
  FROM monthly AS other
  WHERE other.id = cur.id
    AND other.row_number = 2
)
This is a clear use case for an EXISTS clause in SQL.
This is what I use. It's similar to @ElapsedSoul's answer, but from my understanding "IN" is better than "EXISTS" for a static list; I'm not sure whether the performance difference, if any, is significant:
Difference between EXISTS and IN in SQL?
-- Monthly sales per id; return all rows of the ids that have a row number above 1.
WITH monthly AS (
  SELECT
    id,
    year,
    month,
    SUM(sales) AS sales,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY id ASC) AS ROW_NUM
  FROM table
  GROUP BY id, year, month
),
multi_row_ids AS (
  SELECT id
  FROM monthly
  WHERE ROW_NUM > 1
)
SELECT *
FROM monthly
WHERE id IN (SELECT id FROM multi_row_ids);

get the most two recent dates for each customer

Basically, I need to retrieve the last two dates for customers who purchased on at least two different dates (some customers purchased on only one date). The data has the following form:
client_id date
1 2016-07-02
1 2016-07-02
1 2016-06-01
2 2015-06-01
and I would like to get it in the following form
client_id previous_date last_date
1 2016-06-01 2016-07-02
Remarks:
a client can have multiple entries for the same date
a client can have entries only for one date, such customer should be discarded
Rank your dates with DENSE_RANK. Then group by client_id and show the last dates (ranked #1 and #2).
-- Last two distinct purchase dates per client.
-- DENSE_RANK gives rank 1 to the latest date and rank 2 to the one before it,
-- with duplicates of a date sharing its rank. Clients with a single distinct
-- date never reach rank 2 and are removed by the HAVING clause.
select
    client_id,
    max(case when rn = 2 then date end) as previous_date,
    max(case when rn = 1 then date end) as last_date
from
(
    select
        client_id,
        date,
        dense_rank() over (partition by client_id order by date desc) as rn
    from mytable
) ranked  -- alias added: most engines require a name for a derived table
group by client_id
having max(rn) > 1;
build up:
-- psql session: create the sample table s153 (c, d) and load four rows; c 1 has two distinct dates (one duplicated), c 2 has one.
t=# create table s153 (c int, d date);
CREATE TABLE
t=# insert into s153 values (1,'2016-07-02'), (1,'2016-07-02'),(1,'2016-06-01'),(2,'2016-06-01');
INSERT 0 4
query:
t=# with a as (
-- one row per client per distinct date
select distinct c,d from s153
)
, b as (
-- newest date first; the frame spans the whole partition so nth_value sees
-- every date from every row, and distinct collapses the identical rows
select distinct c,nth_value(d,1) over w last_date, nth_value(d,2) over w prev_date
from a
window w as (partition by c order by d desc rows between unbounded preceding and unbounded following)
)
-- clients with a single date have no second value and are dropped
select * from b where prev_date is not null
;
c | last_date | prev_date
---+------------+------------
1 | 2016-07-02 | 2016-06-01
(1 row)
UNTESTED:
We use a common table expression to assign a row number based on the date in descending order and then only include those records having a row number <=2 and then ensure that those having 1 row are excluded by the having.
-- Last two distinct dates per client.
-- DENSE_RANK is used so that repeated entries of the same date share one rank;
-- a plain row number would rank a duplicated latest date as both 1 and 2.
-- `Table` is the placeholder table name.
WITH ranked AS (
    SELECT
        Client_ID,
        Date,
        DENSE_RANK() OVER (PARTITION BY Client_ID ORDER BY Date DESC) AS rn
    FROM Table
)
SELECT
    Client_ID,
    MIN(Date) AS previous_date,
    MAX(Date) AS last_date
FROM ranked
WHERE rn <= 2              -- only the two most recent distinct dates
GROUP BY Client_ID
HAVING MAX(rn) > 1         -- discard clients that have a single date
All you need is a group by...
-- test data (T-SQL table variable, named with @)
DECLARE @tablename TABLE
(
    client_id int,
    [date] datetime
);

INSERT INTO @tablename (client_id, [date])
VALUES (1, '2016-07-02'),
       (1, '2016-07-02'),
       (1, '2016-06-01'),
       (2, '2015-06-01');

-- query
-- LAST_DATE is the client's latest date; PREVIOUS_DATE is the latest date
-- strictly before it (not the earliest date overall). Clients with only one
-- distinct date have no previous date and are discarded.
SELECT
    d.client_id,
    MAX(CASE WHEN d.[date] < d.last_date THEN d.[date] END) AS [PREVIOUS_DATE],
    MAX(d.last_date) AS [LAST_DATE]
FROM (
    SELECT
        client_id,
        [date],
        MAX([date]) OVER (PARTITION BY client_id) AS last_date
    FROM @tablename
) AS d
GROUP BY d.client_id
HAVING MAX(CASE WHEN d.[date] < d.last_date THEN d.[date] END) IS NOT NULL
Updated
-- Sample data: client 1 has four distinct dates (one duplicated), client 2 has
-- a single date, client 3 has two dates.
create table myTable
(
client_id integer,
given_date date
);
insert into myTable
values( 1 , '2016-07-02'),
(1 , '2016-07-02'),
(1 , '2016-06-01'),
(1 , '2016-06-03'),
(1 , '2016-06-09'),
(2 , '2015-06-01'),
(3 , '2016-06-03'),
(3 , '2016-06-09');
-- Query: pair every date of a client (a) with every different date of the same
-- client (b), then keep the single best pair per client.
-- NOTE(review): "date - date" is dialect-specific arithmetic (PostgreSQL-style) -- confirm for your engine.
SELECT sub.client_id, sub.PREVIOUS_DATE, sub.LAST_DATE
FROM
(select
-- Pairs are ranked per client by b.given_date descending, then by the gap to a.given_date ascending.
-- b.given_date is a grouping column, so MAX(b.given_date) equals b.given_date inside each group.
-- Row 1 is therefore the client's latest date paired with the closest other date.
ROW_NUMBER() OVER (PARTITION BY a.client_id order by b.given_date desc,(MAX(b.given_date) - a.given_date)) AS ROW_NUMBER,
a.client_id,a.given_date AS PREVIOUS_DATE, MAX(b.given_date) - a.given_date AS diff, (b.given_date) AS LAST_DATE
FROM myTable AS a
JOIN myTable AS b
ON b.client_id = a.client_id
-- a client with only one distinct date produces no pairs and drops out here
WHERE a.given_date <> b.given_date
-- grouping also collapses the duplicate pairs caused by repeated rows
group by a.client_id, a.given_date, b.given_date) AS sub
WHERE sub.ROW_NUMBER = 1

Group data by the change of grouping column value in order

With the following data
-- Price history fixture: product 1 costs 1 on Jan 1-4, 2 on Jan 5-8, 1 on Jan 9-12 (2012).
CREATE TABLE #ph (product int, [date] date, price int);

INSERT INTO #ph (product, [date], price)
VALUES
    (1, '20120101', 1),
    (1, '20120102', 1),
    (1, '20120103', 1),
    (1, '20120104', 1),
    (1, '20120105', 2),
    (1, '20120106', 2),
    (1, '20120107', 2),
    (1, '20120108', 2),
    (1, '20120109', 1),
    (1, '20120110', 1),
    (1, '20120111', 1),
    (1, '20120112', 1);
I would like to produce the following output:
product | date_from | date_to | price
1 | 20120101 | 20120105 | 1
1 | 20120105 | 20120109 | 2
1 | 20120109 | 20120112 | 1
If I group by price and show the max and min date then I will get the following which is not what I want (see the over lapping of dates).
product | date_from | date_to | price
1 | 20120101 | 20120112 | 1
1 | 20120105 | 20120108 | 2
So essentially what I'm looking to do is group by the step change in data based on group columns product and price.
What is the cleanest way to achieve this?
There's a (more or less) known technique of solving this kind of problem, involving two ROW_NUMBER() calls, like this:
-- Islands of constant price per product.
-- The row number over the whole product minus the row number within
-- (product, price) stays the same while the price is unchanged and jumps
-- when it changes, so (product, price, grp) identifies one price period.
WITH marked AS (
    SELECT
        product,
        [date],
        price,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY [date])
          - ROW_NUMBER() OVER (PARTITION BY product, price ORDER BY [date]) AS grp
    FROM #ph
)
SELECT
    product,
    MIN([date]) AS date_from,
    MAX([date]) AS date_to,
    price
FROM marked
GROUP BY product, price, grp
ORDER BY product, MIN([date])
Output:
product date_from date_to price
------- ---------- ------------- -----
1 2012-01-01 2012-01-04 1
1 2012-01-05 2012-01-08 2
1 2012-01-09 2012-01-12 1
I'm new to this forum so hope my contribution is helpful.
If you really don't want to use a CTE (although I think that's probably the best approach), you can get a solution using set-based code. You will need to test the performance of this code!
I have added an extra temp table so that I can use a unique identifier for each record, but I suspect you will already have this column in your source table. So here's the temp table.
-- Rebuild #phwithId: the rows of #ph plus a sequential SaleId per product
-- (1, 2, 3, ... in date order), used to find adjacent rows.
-- OBJECT_ID resolves the temp table of the current session only; a LIKE
-- search of tempdb.sys.tables can also match other sessions' tables.
IF OBJECT_ID('tempdb..#phwithId') IS NOT NULL
    DROP TABLE #phwithId;

CREATE TABLE #phwithId
(
      SaleId    INT
    , ProductID INT
    , Price     MONEY
    , SaleDate  DATE
);

-- source is the temp table #ph created for the question
INSERT INTO #phwithId (SaleId, ProductID, Price, SaleDate)
SELECT
      ROW_NUMBER() OVER (PARTITION BY product ORDER BY [date] ASC)
    , product
    , price
    , [date]
FROM #ph;
Now the main body of the Select statement
-- Price periods per product, built without window-based grouping:
--   da1 = rows where a period starts (price differs from the previous row,
--         or there is no previous row), numbered per product;
--   da2 = rows where a period ends (price differs from the next row,
--         or there is no next row), numbered per product;
--   the n-th start of a product is paired with its n-th end.
-- SaleId restarts at 1 for every product, so each join also matches on
-- ProductId; otherwise rows of different products would be compared.
SELECT
      da1.ProductId
    , da1.date_from
    , da2.date_to
    , da1.Price
FROM
(
    SELECT
          dfr.ProductId
        , ROW_NUMBER() OVER (PARTITION BY dfr.ProductId ORDER BY dfr.ChangeDate) AS rowno1
        , dfr.ChangeDate AS date_from
        , dfr.Price
    FROM
    (
        SELECT
              sl1.ProductId AS ProductId
            , sl1.SaleDate  AS ChangeDate
            , sl1.Price
        FROM #phwithId sl1
        LEFT JOIN #phwithId sl2          -- sl2 = previous row of the same product
            ON  sl2.ProductId = sl1.ProductId
            AND sl1.SaleId = sl2.SaleId + 1
        WHERE sl1.Price <> sl2.Price OR sl2.Price IS NULL
    ) dfr
) da1
LEFT JOIN
(
    SELECT
          dto.ProductId
        , ROW_NUMBER() OVER (PARTITION BY dto.ProductId ORDER BY dto.ChangeDate) AS rowno2
        , dto.ChangeDate AS date_to
    FROM
    (
        SELECT
              sl1.ProductId
            , sl1.SaleDate AS ChangeDate
        FROM #phwithId sl1
        LEFT JOIN #phwithId sl3          -- sl3 = next row of the same product
            ON  sl3.ProductId = sl1.ProductId
            AND sl1.SaleId = sl3.SaleId - 1
        WHERE sl1.Price <> sl3.Price OR sl3.Price IS NULL
    ) dto
) da2
    ON  da1.ProductId = da2.ProductId
    AND da1.rowno1 = da2.rowno2
By joining the data source to itself offset by one record (+ or -), we can identify where the price buckets change; then it's just a matter of getting the start and end dates of each bucket back into a single record.
It's all a bit fiddly and I'm not sure it's going to give better performance, but I enjoyed the challenge.
-- Price periods per product using LAG + running SUM.
-- marked: flags each row with 1 when its price differs from the previous
--         row of the same product (the first row is always flagged, because
--         LAG returns NULL there and the comparison is not true).
-- marked_as_group: the running total of the flags numbers the price periods.
WITH marked AS (
    SELECT
        product,
        [date],
        price,
        CASE
            WHEN LAG(price) OVER (PARTITION BY product ORDER BY [date]) = price
                THEN 0
            ELSE 1
        END AS is_price_change
    FROM #ph
),
marked_as_group AS (
    SELECT
        m.*,
        SUM(m.is_price_change) OVER (
            PARTITION BY m.product
            ORDER BY m.[date]
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS price_change_group
    FROM marked AS m
)
SELECT
    product,
    MIN([date]) AS date_from,
    MAX([date]) AS date_to,
    MIN(price)  AS price      -- price is constant within a group
FROM marked_as_group
GROUP BY
    product,
    price_change_group
ORDER BY
    product,
    date_to
One solution I have come up with which is relatively "clean" is:
-- Number the rows of each product by date, then return every row that is NOT
-- followed by a row of the same product with the same price (the last row of
-- each price run). date_to is the date of the product's next row, or NULL
-- when there is none.
;WITH cte_sort (product, [date], price, [row]) AS (
    SELECT
        product,
        [date],
        price,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY [date] ASC) AS row
    FROM #ph
)
SELECT
    cur.product,
    cur.[date]   AS date_from,
    nxt.[date]   AS date_to,
    cur.price
FROM cte_sort AS cur
OUTER APPLY (
    -- the product's very next row by row number
    SELECT TOP 1 z.[date]
    FROM cte_sort AS z
    WHERE z.product = cur.product
      AND z.row > cur.row
    ORDER BY z.row
) AS nxt
WHERE NOT EXISTS (
    SELECT 1
    FROM cte_sort AS same_price
    WHERE same_price.product = cur.product
      AND same_price.row = cur.row + 1
      AND same_price.price = cur.price
)
ORDER BY cur.[date]
I used a CTE with row_number because you then don't need to worry about whether any dates are missing if you use functions like dateadd. You obviously only need the outer apply if you want to have the date_to column (which I do).
This solution does solve my problem, I am however having a slight issue getting it to perform as quickly as I'd like on my table of 5 million rows.
-- Returns the column names of the given table as one space-separated string,
-- read from information_schema.columns.
--   @table_name : name of the table whose columns are listed
--   returns     : nvarchar(4000); NULL when no column matches the table name
-- T-SQL parameters and local variables are prefixed with @.
CREATE FUNCTION [dbo].[AF_TableColumns](@table_name nvarchar(55))
RETURNS nvarchar(4000) AS
BEGIN
    DECLARE @str nvarchar(4000)

    -- Each row prepends its column name to @str; rows are visited in
    -- descending ordinal_position, so the final string reads in ascending order.
    -- NOTE(review): building a string by assigning a variable in a multi-row
    -- SELECT with ORDER BY is not guaranteed behaviour -- verify on your version.
    SELECT @str = CAST(RTRIM(LTRIM(column_name)) AS nvarchar(500)) + COALESCE(' ' + @str, ' ')
    FROM information_schema.columns
    WHERE table_name = @table_name
    GROUP BY table_name, column_name, ordinal_position
    ORDER BY ordinal_position DESC

    RETURN @str
END
--select dbo.AF_TableColumns('YourTable') Select * from YourTable