SQL remove duplicates at ID and Month level

SQL remove duplicates at ID and Month level - sql

I have a table that is something like this:
ID Date Name Age
1 10/04/2015 Theja 24
1 28/04/2015 Theja1 26
1 14/07/2015 Theja2 45
1 30/07/2015 Theja2 45
1 30/08/2015 Theja3 54
2 10/04/2016 Jaya 23
2 28/04/2016 Jaya 23
2 14/05/2016 Jaya1 65
2 30/05/2016 Jaya1 65
But I want output like:
ID Date Name Age
1 28/04/2015 Theja1 26
1 01/05/2015 Theja1 26
1 01/06/2015 Theja1 26
1 30/07/2015 Theja2 45
1 30/08/2015 Theja3 54
2 28/04/2016 Jaya 23
2 30/05/2016 Jaya1 65
Keep one record per ID per month (the one with the latest, i.e. max, date), and if any months are missing for an ID, fill them in using the previous month's record.

Different databases have different methods for handling dates. The following is an ANSI-standard way of getting one row per month:
-- One row per id and calendar month. The question asks for the latest
-- (max) record of each month, so take max(date) rather than min(date).
-- Grouping on year as well as month keeps the same month of different
-- years apart.
select
    id,
    max(date) as last_date_in_month
from t
group by
    id,
    extract(year from date),
    extract(month from date);

I tried to come up with a solution and arrived at the following, but you need a calendar table to insert the missing rows into the output.
A SQL Server based solution is given here.
Data Setup:
-- Sample data for the de-duplication example: several rows per id and month.
create table temptable (
    id     int,
    [date] date,
    name   varchar(50),
    age    int
);

-- Dates are written as unseparated yyyymmdd literals, which SQL Server
-- interprets the same way under every DATEFORMAT / language setting.
-- The former 'mm-dd-yyyy' strings fail (or swap day and month) when the
-- session uses dmy.
insert into temptable (id, [date], name, age)
values
    (1, '20150410', 'Theja',  24),
    (1, '20150428', 'Theja1', 26),
    (1, '20150714', 'Theja2', 45),
    (1, '20150730', 'Theja2', 45),
    (1, '20150830', 'Theja3', 54),
    (2, '20160410', 'Jaya',   23),
    (2, '20160428', 'Jaya',   23),
    (2, '20160514', 'Jaya1',  65),
    (2, '20160530', 'Jaya1',  65);
The following solution handles the duplicate issue, but to get the missing rows you need to implement a calendar table.
You can join the output of cte3 with the calendar table to fill in the missing data.
-- Keep, for every id and calendar month, only the row with the latest date,
-- and expose the date of the next kept row of the same id (NextDate) so that
-- the gap up to NextDate can be filled from a calendar table.
with cte1 as (
    select
        t.id,
        t.[date],
        t.name,
        t.age,
        -- Number rows inside each (id, year, month) group, newest first, so
        -- rownm = 1 is the row to keep. Partitioning by month alone would mix
        -- different ids and years into one numbering and would need a
        -- concatenated string key plus a self-join to recover the group.
        row_number() over (
            partition by t.id, year(t.[date]), month(t.[date])
            order by t.[date] desc
        ) as rownm
    from temptable as t
),
cte3 as (
    select
        a.id,
        a.[date],
        a.name,
        a.age,
        lead(a.[date]) over (partition by a.id order by a.[date]) as NextDate
    from cte1 as a
    where a.rownm = 1
)
select
    c.id,
    c.[date],
    c.name,
    c.age,
    c.NextDate
from cte3 as c
-- id is a tie-breaker so the output order is deterministic.
order by c.[date], c.id;

Related

how to fetch year on year data based on rule

I have a table with id and date, and I need two outputs as described below. I will pass an input date and need records according to the following conditions:
output 1: The first row is the latest date less than or equal to the input date, whether or not it is a month end. The next rows are the month-end data for the first row's date minus 1 year and minus 2 years; if no data is available for a particular year, return NULL on that row.
output 2: Same as the first, with one difference: for the first row's date minus 1 and minus 2 years I need whatever data is available, not NULL.
-- Sample data. This is a table variable, so its name must start with @
-- ("declare #tbl table" does not parse; # names are temp tables created
-- with CREATE TABLE).
declare @tbl table (id int , marketdate date )
insert into @tbl (id,marketdate)
values (1,'2018-05-31'),
(1,'2017-05-29'),
(1,'2016-05-31'),
(2,'2018-02-28'),
(2,'2017-02-28'),
(2,'2016-02-29'),
(2,'2016-02-28')
-- My query:
;with cte as (
-- Anchor: per id, the latest marketdate on or before the input date.
select id , marketdate
from (
select id , marketdate ,row_number() over(partition by id order by marketdate desc) rn
from @tbl
where marketdate <='2018-06-05'
) a where rn=1
union all
-- Recursive step: per id, the latest marketdate that is at least one year
-- earlier than the row found at the previous level.
-- NOTE(review): SQL Server applies ranking functions in the recursive member
-- only to the rows of the current recursion step -- confirm that this is the
-- intended behaviour here.
select id , marketdate
from (
select b.id , b.marketdate ,row_number() over(partition by b.id order by b.marketdate desc) rn
from @tbl b inner join cte c
on b.id= c.id
where b.marketdate<= dateadd(year,-1,c.marketdate )
) b where rn=1
)
select * from cte
order by id, marketdate desc
output 1:
1 2018-05-31
1 NULL
1 2016-05-31
2 2018-02-28
2 2017-02-28
2 2016-02-29
output 2:
1 2018-05-31
1 2017-05-29
1 2016-05-31
2 2018-02-28
2 2017-02-28
2 2016-02-29
Please help.

SQL get consecutive days ignoring weekend

I have a table with following format:
ID ID1 ID2 DATE
1 1 1 2018-03-01
2 1 1 2018-03-02
3 1 1 2018-03-05
4 1 1 2018-03-06
5 1 1 2018-03-07
6 2 2 2018-03-05
7 2 2 2018-03-05
8 2 2 2018-03-06
9 2 2 2018-03-07
10 2 2 2018-03-08
From this table I have to get all records where ID1 and ID2 are the same in that column and where DATE is 5 consecutive work days (5 dates in a row, ignoring missing dates for Saturday/Sunday; ignore holidays).
I have really no idea how to achieve this. I did search around, but couldn't find anything that helped me. So my question is, how can I achieve following output?
ID ID1 ID2 DATE
1 1 1 2018-03-01
2 1 1 2018-03-02
3 1 1 2018-03-05
4 1 1 2018-03-06
5 1 1 2018-03-07
SQLFiddle to mess around

Assuming you have no duplicates and work is only on weekdays, then there is a simplish solution for this particular case. We can identify the date 4 rows ahead. For a complete week, it is either 4 days ahead or 6 days ahead:
-- Returns the first row of every run of five consecutive work days.
-- dat_4 is the date four rows ahead within the same (id1, id2) group:
--   * 4 days apart -> the run is Monday..Friday
--   * 6 days apart -> the run spans exactly one weekend
-- The window is partitioned by id1, id2 so that lead() never looks into the
-- rows of a different group. Assumes no duplicate dates per group and no
-- rows on weekends.
select s.*
from (
    select
        t.*,
        lead(dat, 4) over (partition by id1, id2 order by dat) as dat_4
    from t
) as s
where datediff(day, s.dat, s.dat_4) in (4, 6);
This happens to work because you are looking for a complete week.
Here is the SQL Fiddle.

-- Every row of t whose (id1, id2) pair has exactly five distinct dates.
-- Only the number of distinct dates is checked, not whether they are
-- consecutive.
select t.*
from t
inner join (
    select id1, id2
    from t
    group by id1, id2
    having count(distinct dat) = 5
) as five_day_pairs
    on  five_day_pairs.id1 = t.id1
    and five_day_pairs.id2 = t.id2
Check this-
Dates of Two weeks with 10 valid dates
http://sqlfiddle.com/#!18/76556/1
Dates of Two weeks with 10 non-unique dates
http://sqlfiddle.com/#!18/b4299/1
and
Dates of Two weeks with less than 10 but unique
http://sqlfiddle.com/#!18/f16cb/1

This query is very verbose without LEAD or LAG and it is the best I could do on my lunch break. You can probably improve on it given the time.
-- Finds rows of @T that belong to a run of at least @DaysThreshold
-- consecutive work days per (ID1, ID2), without using LEAD/LAG.
-- Table and scalar variables carry the @ prefix and the DATEFIRST setting is
-- read through @@DATEFIRST; with # / ## the script does not parse.
-- Date literals use the unseparated yyyymmdd form, which does not depend on
-- the session's DATEFORMAT.
DECLARE @T TABLE
(
ID INT,
ID1 INT,
ID2 INT,
TheDate DATETIME
)
INSERT @T SELECT 1,1,1,'20180301'
INSERT @T SELECT 2,1,1,'20180302'
INSERT @T SELECT 3,1,1,'20180305'
INSERT @T SELECT 4,1,1,'20180306'
INSERT @T SELECT 5,1,1,'20180307'
--INSERT @T SELECT 5,1,1,'20180309'
INSERT @T SELECT 6,2,2,'20180302'
INSERT @T SELECT 7,2,2,'20180305'
INSERT @T SELECT 8,2,2,'20180305'
--INSERT @T SELECT 9,2,2,'20180306'
INSERT @T SELECT 10,2,2,'20180307'
INSERT @T SELECT 11,2,2,'20180308'
INSERT @T SELECT 12,2,2,'20180315'
INSERT @T SELECT 13,1,1,'20180401'
INSERT @T SELECT 14,1,1,'20180402'
INSERT @T SELECT 15,1,1,'20180405'
--SELECT * FROM @T
-- Calendar bounds: one day before the first and one day after the last date.
DECLARE @LowDate DATETIME = DATEADD(DAY,-1,(SELECT MIN(TheDate) FROM @T))
DECLARE @HighDate DATETIME = DATEADD(DAY,1,(SELECT MAX(TheDate) FROM @T))
-- Minimum run length that qualifies a row for the result.
DECLARE @DaysThreshold INT = 5
;
-- Recursive calendar from @LowDate up to (not including) @HighDate.
WITH Dates AS
(
SELECT DateValue=@LowDate
UNION ALL
SELECT DateValue + 1 FROM Dates
WHERE DateValue + 1 < @HighDate
),
-- Calendar days with the matching @T rows attached (NULL columns when none).
Joined AS
(
SELECT * FROM Dates LEFT OUTER JOIN @T T ON T.TheDate=Dates.DateValue
),
-- Per (ID1, ID2, TheDate): the latest earlier date of the same group, the gap
-- to it in days, and whether the day before TheDate is a Saturday or Sunday.
Calculations AS
(
SELECT
ID=MAX(J1.ID),
J1.ID1,J1.ID2,
J1.TheDate,
LastDate=MAX(J2.TheDate),
-- (DATEPART(DW, d) + @@DATEFIRST) % 7 is 0 for Saturday and 1 for Sunday
-- whatever the DATEFIRST setting is.
LastDateWasWeekend = CASE WHEN ((DATEPART(DW,DATEADD(DAY,-1,J1.TheDate) ) + @@DATEFIRST) % 7) NOT IN (0, 1) THEN 0 ELSE 1 END,
Offset = DATEDIFF(DAY,MAX(J2.TheDate),J1.TheDate)
FROM
Joined J1
LEFT OUTER JOIN Joined J2 ON J2.ID1=J1.ID1 AND J2.ID2=J1.ID2 AND J2.TheDate<J1.TheDate
WHERE
NOT J1.ID IS NULL
GROUP BY J1.ID1,J1.ID2,J1.TheDate
)
-- IsValid = 1 when the row continues (or starts) a run; 0 when it breaks one.
-- A sentinel row at @HighDate with IsValid = 0 is added per (ID1, ID2).
,FindValid AS
(
SELECT
ID,ID1,ID2,TheDate,
IsValid=CASE
WHEN LastDate=TheDate THEN 0
WHEN LastDate IS NULL THEN 1
WHEN Offset=1 THEN 1
WHEN Offset>3 THEN 0
WHEN Offset<=3 THEN
LastDateWasWeekend
END
FROM
Calculations
UNION
SELECT DISTINCT ID=NULL,ID1,ID2, TheDate=@HighDate,IsValid=0 FROM @T
),
-- MaxRange: the first later date of the same group that has IsValid = 0.
FindMax As
(
SELECT
This.ID,This.ID1,This.ID2,This.TheDate,MaxRange=MIN(Next.TheDate)
FROM
FindValid This
LEFT OUTER JOIN FindValid Next ON Next.ID2=This.ID2 AND Next.ID1=This.ID1 AND This.TheDate<Next.TheDate AND Next.IsValid=0
GROUP BY
This.ID,This.ID1,This.ID2,This.TheDate
),
-- MinRange: the earliest date of the same group whose MaxRange lies after
-- this row's date.
FindMin AS
(
SELECT
This.ID,This.ID1,This.ID2,This.TheDate,This.MaxRange,MinRange=MIN(Next.TheDate)
FROM
FindMax This
LEFT OUTER JOIN FindMax Next ON Next.ID2=This.ID2 AND Next.ID1=This.ID1 AND This.TheDate<Next.MaxRange-- AND Next.IsValid=0 OR Next.TheDate IS NULL
GROUP BY
This.ID,This.ID1,This.ID2,This.TheDate,This.MaxRange
)
-- Number of rows sharing each (ID1, ID2, MinRange, MaxRange) range.
,Final AS
(
SELECT
ID1,ID2,MinRange,MaxRange,SequentialCount=COUNT(*)
FROM
FindMin
GROUP BY
ID1,ID2,MinRange,MaxRange
)
-- Original rows that fall inside a range with enough rows.
SELECT
T.ID,
T.ID1,
T.ID2,
T.TheDate
FROM @T T
INNER JOIN Final ON T.TheDate>= Final.MinRange AND T.TheDate < Final.MaxRange AND T.ID1=Final.ID1 AND T.ID2=Final.ID2
WHERE
SequentialCount>=@DaysThreshold
-- The Dates CTE recurses once per calendar day; lift the default limit of 100.
OPTION (MAXRECURSION 0)

How to remove duplicate ID rows. While removing, use the rows that has NULL value in another column

While removing duplicate rows with same ID value, how to remove the rows that has null value in one particular column.
Note: there are other non-duplicate rows (below e.g., 12) that has NULL value and should still get selected in the result set.
Input table:
Id | sale_date | price
-----------------------------
11 20051020 22.1
11 NULL 20.1
12 NULL 20.1
13 20051020 20.1
Expected result:
Id | sale_date | price
-----------------------------
11 20051020 22.1
12 NULL 20.1
13 20051020 20.1

Assuming you have SQL Server 2008 or above, this will work for you. I use row_number and assign the values by ID starting at the max date. So any value higher than 1 is lower than the max date for that particular ID so I delete row_num greater than 1.
Check it out:
-- Demo: remove duplicate IDs, keeping the row with the latest Sale_date.
-- @yourTable is a table variable, so the name needs the @ prefix
-- ("DECLARE #yourTable TABLE" does not parse).
DECLARE @yourTable TABLE (ID INT, Sale_date DATE, Price FLOAT);

INSERT INTO @yourTable (ID, Sale_date, Price)
VALUES (11, '20051020', 22.1),
       (11, NULL,       20.1),
       (12, NULL,       20.1),
       (13, '20051020', 20.1);

-- SQL Server sorts NULL lowest, so ORDER BY Sale_date DESC puts a NULL date
-- last within its ID: it gets row_num = 1 only when it is the ID's only row.
WITH CTE AS
(
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY ID ORDER BY Sale_date DESC) AS row_num
    FROM @yourTable
)
-- Deleting through the CTE removes the underlying rows of @yourTable.
DELETE
FROM CTE
WHERE row_num > 1;

SELECT *
FROM @yourTable;

Try this
-- One row per ID: the row ranked first when ordering by Sale_Date descending.
SELECT ranked.*
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY ID ORDER BY Sale_Date DESC) AS ROW_NUM
    FROM AA1
) AS ranked
WHERE ranked.ROW_NUM = 1

If you have duplicate Ids, and not every sale_date is NULL, you can keep the latest date, and delete the other ones:
-- For Ids that occur more than once, delete every row whose sale_date is NULL
-- or earlier than the latest sale_date of that Id.
DELETE T1
FROM #MyTable AS T1
INNER JOIN (
    SELECT Id, MAX(sale_date) AS sale_date
    FROM #MyTable
    GROUP BY Id
    HAVING COUNT(*) > 1
) AS T2
    ON  T1.ID = T2.ID
    AND (T1.sale_date IS NULL OR T1.sale_date < T2.sale_date)

How to maintain cumulative sum for each User in SQL server

I have a table like this:
ID UserID rupees time
1 1 200 2014-01-05
---------------------------------
2 1 500 2014-04-06
----------------------------------
3 2 10 2014-05-05
----------------------------------
4 2 20 2014-05-06
----------------------------------
I want the output like:
ID UserID Rupees time CumulativeSum
1 1 200 2014-01-05 200
-------------------------------------------------
2 1 500 2014-04-06 700
-------------------------------------------------
3 2 10 2014-05-05 10
-------------------------------------------------
4 2 20 2014-05-06 30
---------------------------------------------------
How can I get this table as output?

Please try using CTE:
-- Running total per user: number each user's rows by [time], then add up the
-- rupees of all rows of that user with the same or a smaller row number.
;with numbered as (
    select
        *,
        row_number() over (partition by UserId order by [time]) as RN
    from tbl
)
select
    cur.UserID,
    cur.rupees,
    cur.[time],
    running.CumulativeSum
from numbered as cur
outer apply (
    select sum(prev.rupees) as CumulativeSum
    from numbered as prev
    where prev.UserID = cur.UserID
      and prev.RN <= cur.RN
) as running
For records with column value time increasing, try the below query:
-- Running total per user, summing every row of the same user whose [time]
-- is not later than the current row's [time].
select
    cur.UserID,
    cur.rupees,
    cur.[time],
    (
        select sum(prev.rupees)
        from tbl as prev
        where prev.UserID = cur.UserID
          and prev.[time] <= cur.[time]
    ) as CumulativeSum
from tbl as cur

For SQL Server 2012 or later, you can use SUM() with an OVER clause that specifies a ROWS clause:
-- Running total per user with a windowed SUM (SQL Server 2012 or later).
-- @t is a table variable, so the name needs the @ prefix.
declare @t table (ID int, UserID int, rupees int, [time] date);

insert into @t (ID, UserID, rupees, [time]) values
(1,1,200,'20140105'),
(2,1,500,'20140406'),
(3,2, 10,'20140505'),
(4,2, 20,'20140506');

select
    *,
    SUM(rupees) OVER (
        PARTITION BY UserID
        ORDER BY id /* or time? */
        -- Explicit frame: everything from the partition start up to this row.
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as total
from @t;
Result:
ID UserID rupees time total
----------- ----------- ----------- ---------- -----------
1 1 200 2014-01-05 200
2 1 500 2014-04-06 700
3 2 10 2014-05-05 10
4 2 20 2014-05-06 30

-- Running total of rupees per user, ordered by DateKey.
-- @t is a table variable, so the name needs the @ prefix.
DECLARE @t TABLE (UserID INT, rupees INT, DateKey DATE);

INSERT INTO @t (UserID, rupees, DateKey) VALUES
(1, 200, '2014-01-05'),
(2, 300, '2014-01-06'),
(2, 800, '2014-03-06');

-- The inner sum is limited to the current row's user and to rows dated on or
-- before it. Comparing rupees across all users (t.rupees <= tt.rupees) summed
-- unrelated rows and did not give a per-user running total.
-- No GROUP BY: every source row is returned, including exact duplicates.
SELECT
    tt.UserID,
    tt.rupees,
    tt.DateKey,
    (
        SELECT SUM(t.rupees)
        FROM @t AS t
        WHERE t.UserID = tt.UserID
          AND t.DateKey <= tt.DateKey
    ) AS CumulativeSum
FROM @t AS tt;

Hope this too helps you.
-- Running total per user through a self-join on row numbers.
-- @tab is a table variable, so the name needs the @ prefix.
DECLARE @tab TABLE (id INT, userId INT, rupees INT, [time] DATE);

INSERT INTO @tab (id, userId, rupees, [time]) VALUES
(1, 1, 200, '2014-01-05'),
(2, 1, 500, '2014-04-06'),
(3, 2, 10,  '2014-05-05'),
(4, 2, 20,  '2014-05-06');

-- LU is the row being reported; B supplies every row of the same user whose
-- row number (ordered by [time]) is not greater than LU's, and those rupees
-- are summed.
SELECT
    LU.id,
    LU.userId,
    LU.rupees,
    LU.[time],
    SUM(B.rupees) AS CumulativeSum
FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY userId ORDER BY [time]) AS R FROM @tab) AS B
INNER JOIN (SELECT *, ROW_NUMBER() OVER (PARTITION BY userId ORDER BY [time]) AS R FROM @tab) AS LU
    ON  B.userId = LU.userId
    AND B.R <= LU.R
GROUP BY LU.id, LU.userId, LU.rupees, LU.[time];
Result

I am assuming that you are not using SQL Server 2012, which provides the cumulative sum function. The other answers use some form of the row_number() function, but this seems totally unnecessary. I usually approach cumulative sums using correlated subqueries:
-- Running total per user: sum the rupees of all rows of the same user whose
-- ID is not greater than the current row's ID.
-- The table is referred to as tbl, as in the other answers; the bare word
-- "table" is a reserved keyword and cannot be used as a table name unquoted.
select
    t.ID,
    t.UserID,
    t.rupees,
    t.[time],
    (
        select sum(t2.rupees)
        from tbl as t2
        where t2.UserID = t.UserID
          and t2.ID <= t.ID
    ) as CumulativeSum
from tbl as t;
This requires having a column that uniquely identifies each row, and that seems to be the purpose of id. For performance, I would want to have an index on table(UserId, ID, rupees).

-- Running total per user in id order.
-- The frame is spelled out as ROWS: when ORDER BY is given without a frame the
-- default is RANGE ... CURRENT ROW, which also adds in any rows that tie on
-- the ORDER BY value.
select
    *,
    SUM(rupees) OVER (
        PARTITION BY UserID
        ORDER BY id
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as CumSum
from #tbl

Query to return first date of missing date ranges

Looking for help with a query using SQL 2008 R2... I have a table with client and date fields. Most clients have a record for most dates, however some don't.
For example I have this data:
CLIENTID DT
1 5/1/14
1 5/2/14
2 5/3/14
3 5/1/14
3 5/2/14
I can find the missing dates for each CLIENTID by creating a temp table with all possible dates for the period and then joining that to each CLIENTID and DT and only selecting where there is a NULL.
This is what I can get easily for the date range 5/1/14 to 5/4/14:
CLIENTID DTMISSED
1 5/3/14
1 5/4/14
2 5/1/14
2 5/2/14
2 5/4/14
3 5/3/14
3 5/4/14
However I want to group each consecutive missed period together and get the beginning of each period and the length.
For example, if I use the date range 5/1/14 to 5/4/14 I'd like to get:
CLIENTID DTSTART MISSED
1 5/3/14 2
2 5/1/14 2
2 5/4/14 1
3 5/3/14 2
Thanks for helping!

It's fascinating how much more elegantly and also more efficiently this kind of problem can be solved in 2012.
First, the tables:
-- Sample data: the dates on which each client has a record.
create table #t (CLIENTID int, DT date)
go
-- Dates use unseparated yyyymmdd literals; 'm/d/yy' strings are read
-- differently depending on the session's DATEFORMAT and two-digit-year cutoff.
insert #t (CLIENTID, DT) values
(1, '20140501'),
(1, '20140502'),
(2, '20140503'),
(3, '20140501'),
(3, '20140502')
go
-- Calendar: every date of the period being checked.
create table #calendar (dt date)
go
insert #calendar (dt) values ('20140501'),('20140502'),('20140503'),('20140504')
go
Here's the 2008 solution:
-- SQL Server 2008 version: start date and length of every run of consecutive
-- missing dates per client.
;with missing as (
    -- Every (client, calendar date) pair that has no row in #t, numbered in
    -- client / date order.
    select
        c.dt,
        ids.clientid,
        row_number() over (order by ids.clientid, c.dt) as rn
    from #calendar as c
    cross join (select distinct clientid from #t) as ids
    where not exists (
        select *
        from #t as t
        where t.dt = c.dt
          and t.clientid = ids.clientid
    )
),
linked as (
    -- Attach the previous day when it is also missing for the same client;
    -- prev_dt is NULL on the first day of a run.
    select
        cur.clientid,
        cur.dt,
        cur.rn,
        prev.dt as prev_dt
    from missing as cur
    left join missing as prev
        on  prev.clientid = cur.clientid
        and cur.dt = dateadd(day, 1, prev.dt)
),
grouped as (
    -- grp = number of run starts seen up to and including this row, which is
    -- the same for all rows of one run.
    select
        l.clientid,
        l.dt,
        (
            select sum(case when l2.prev_dt is null then 1 else 0 end)
            from linked as l2
            where l2.rn <= l.rn
        ) as grp
    from linked as l
)
select
    clientid,
    min(dt)  as dtstart,
    count(*) as missed
from grouped
group by clientid, grp
-- dtstart makes the order of several runs of one client deterministic.
order by clientid, dtstart
Compare it to 2012:
-- SQL Server 2012 version: start date and length of every run of consecutive
-- missing dates per client, using a windowed running sum.
;with missing as (
    -- Every (client, calendar date) pair that has no row in #t.
    select
        c.dt,
        ids.clientid
    from #calendar as c
    cross join (select distinct clientid from #t) as ids
    where not exists (
        select *
        from #t as t
        where t.dt = c.dt
          and t.clientid = ids.clientid
    )
),
grouped as (
    -- prev.dt is NULL on the first day of a run, so the running count of
    -- those rows is the same for all rows of one run.
    select
        cur.clientid,
        cur.dt,
        sum(case when prev.dt is null then 1 else 0 end)
            over (order by cur.clientid, cur.dt
                  rows between unbounded preceding and current row) as grp
    from missing as cur
    left join missing as prev
        on  prev.clientid = cur.clientid
        and cur.dt = dateadd(day, 1, prev.dt)
)
select
    clientid,
    min(dt)  as dtstart,
    count(*) as missed
from grouped
group by clientid, grp
-- dtstart makes the order of several runs of one client deterministic.
order by clientid, dtstart

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL remove duplicates at ID and Month level - sql

Different databases have different methods for handling dates. The following is an ANSI-standard way of getting one row per month: select id, min(date) from t group by id, extract(year from date), extract(month from date);

Related

how to fetch year on year data based on rule

SQL get consecutive days ignoring weekend

How to remove duplicate ID rows. While removing, use the rows that has NULL value in another column

How to maintain cumulative sum for each User in SQL server

Query to return first date of missing date ranges

Categories

Resources