How do I remove all but some records based on a threshold?

How do I remove all but some records based on a threshold? - sql

I have a table like this:
-- Sample data: five names for id 1, four for id 2 and one for id 3.
CREATE TABLE #TEMP
(
    id   int,
    name varchar(100)
);

INSERT INTO #TEMP (id, name) VALUES (1, 'John');
INSERT INTO #TEMP (id, name) VALUES (1, 'Adam');
INSERT INTO #TEMP (id, name) VALUES (1, 'Robert');
INSERT INTO #TEMP (id, name) VALUES (1, 'Copper');
INSERT INTO #TEMP (id, name) VALUES (1, 'Jumbo');
INSERT INTO #TEMP (id, name) VALUES (2, 'Jill');
INSERT INTO #TEMP (id, name) VALUES (2, 'Rocky');
INSERT INTO #TEMP (id, name) VALUES (2, 'Jack');
INSERT INTO #TEMP (id, name) VALUES (2, 'Lisa');
INSERT INTO #TEMP (id, name) VALUES (3, 'Amy');

-- Show everything that was loaded, then clean up.
SELECT id, name
FROM #TEMP;

DROP TABLE #TEMP;
I want to keep at most 3 records per id: for any id that has more than 3 names, the extra rows should be removed. I am trying to get something like this:
id name
1 Adam
1 Copper
1 John
2 Jill
2 Jack
2 Lisa
3 Amy
I am not understanding how to write this query. I have gotten to the extent of preserving one record but not a threshold of records:
-- crowded_ids: every id that occurs at least three times.
;WITH crowded_ids AS
(
    SELECT id
    FROM #TEMP
    GROUP BY id
    HAVING COUNT(id) >= 3
)
-- Crowded ids collapse to a single row carrying MAX(name) ...
SELECT t.id, MAX(t.name)
FROM #TEMP AS t
WHERE t.id IN (SELECT c.id FROM crowded_ids AS c)
GROUP BY t.id
UNION
-- ... while every other id keeps all of its rows.
SELECT t.id, t.name
FROM #TEMP AS t
WHERE t.id NOT IN (SELECT c.id FROM crowded_ids AS c)
Gives me:
1 Robert
2 Rocky
3 Amy
Any suggestions? Oh by the way, I don't care what records are preserved while merging.

You can do it using CTE
-- Fixture: five names for id 1, four for id 2 and one for id 3.
CREATE TABLE #TEMP (id int, name varchar(100));

INSERT INTO #TEMP (id, name) VALUES (1, 'John');
INSERT INTO #TEMP (id, name) VALUES (1, 'Adam');
INSERT INTO #TEMP (id, name) VALUES (1, 'Robert');
INSERT INTO #TEMP (id, name) VALUES (1, 'Copper');
INSERT INTO #TEMP (id, name) VALUES (1, 'Jumbo');
INSERT INTO #TEMP (id, name) VALUES (2, 'Jill');
INSERT INTO #TEMP (id, name) VALUES (2, 'Rocky');
INSERT INTO #TEMP (id, name) VALUES (2, 'Jack');
INSERT INTO #TEMP (id, name) VALUES (2, 'Lisa');
INSERT INTO #TEMP (id, name) VALUES (3, 'Amy');

-- Before: all ten rows.
SELECT id, name
FROM #TEMP;

-- Number the rows inside each id group and delete through the CTE everything
-- numbered above 3. Ordering by the partition key leaves the numbering within a
-- group arbitrary, so which three rows survive is not defined.
-- The table name is spelled #TEMP everywhere so the script also runs when
-- identifiers are compared case-sensitively.
WITH numbered (rn) AS
(
    SELECT ROW_NUMBER() OVER (PARTITION BY id ORDER BY id)
    FROM #TEMP
)
DELETE FROM numbered
WHERE rn > 3;

-- After: at most three rows per id.
SELECT id, name
FROM #TEMP;

DROP TABLE #TEMP;

I would change your select like this (not tested):
-- ids that own more than 3 rows (the threshold applies per id, so group by id, not by name)
SELECT id FROM #temp GROUP BY id HAVING COUNT(*) > 3;
Then you can turn your query into a DELETE statement, using this select in its WHERE clause.

In the inner query you can use the row_number function over (partition by id),
and then filter on that row number in the outer query, as below:
-- Return at most three rows per id; which three is arbitrary.
SELECT ranked.id, ranked.name
FROM (
    SELECT id,
           name,
           -- ORDER BY (SELECT NULL): SQL Server rejects an integer constant such as
           -- "ORDER BY 1" inside a window clause, and no particular order is needed.
           ROW_NUMBER() OVER (PARTITION BY id ORDER BY (SELECT NULL)) AS count_id
    FROM #TEMP
    GROUP BY id, name   -- collapses exact duplicate (id, name) pairs before numbering
) AS ranked             -- SQL Server requires an alias on a derived table
WHERE ranked.count_id <= 3;

If I understood your question correctly, you need the rows whose id occurs 3 or more times:
-- Self-join on id, then count the joined rows per (name, id) group.
-- When every (name, id) pair is unique, that count is the number of rows sharing
-- the id, so only ids carried by at least three rows pass the HAVING filter.
SELECT
    a.name,
    a.id
FROM tbl1 AS a
INNER JOIN tbl1 AS b
    ON b.id = a.id
GROUP BY
    a.name,
    a.id
HAVING COUNT(a.id) >= 3

Related

Sql query regarding foreign key dependency

I have two tables, Table A and Table B. Both the table have the "Id" column. Table B is dependent (foreign key) on this "Id". So i want to retrieve the rows which are not present in B.

You seem to want:
-- Rows of tablea for which no row with the same id exists in tableb.
SELECT parent.*
FROM tablea AS parent
WHERE NOT EXISTS (
    SELECT 1
    FROM tableb AS child
    WHERE child.id = parent.id
);

This should work in all flavours of SQL:
-- Anti-join: keep the TableA rows whose LEFT JOIN found no partner in TableB
-- (the joined side's id is then NULL).
SELECT a.*
FROM TableA AS a
LEFT JOIN TableB AS b
    ON b.id = a.id
WHERE b.id IS NULL

-- Fixture: TableA holds people, TableB holds project rows that carry a TableA ID.
CREATE TABLE TableA
(
    ID     INT,
    [Name] VARCHAR(500)
);

INSERT INTO TableA (ID, [Name]) VALUES (1, 'James');
INSERT INTO TableA (ID, [Name]) VALUES (2, 'John');
INSERT INTO TableA (ID, [Name]) VALUES (3, 'Betty');
INSERT INTO TableA (ID, [Name]) VALUES (4, 'Sherlin');

CREATE TABLE TableB
(
    TableBID INT,
    ID       INT,
    Project  VARCHAR(250)
);

INSERT INTO TableB (TableBID, ID, Project) VALUES (1, 1, 'ABC');
INSERT INTO TableB (TableBID, ID, Project) VALUES (2, 1, 'XYZ');
INSERT INTO TableB (TableBID, ID, Project) VALUES (3, 2, 'ASD');
INSERT INTO TableB (TableBID, ID, Project) VALUES (4, 1, 'VGF');
INSERT INTO TableB (TableBID, ID, Project) VALUES (5, 3, 'ABC');
INSERT INTO TableB (TableBID, ID, Project) VALUES (6, 3, 'XYZ');
INSERT INTO TableB (TableBID, ID, Project) VALUES (7, 2, 'FGH');

-- Option 1: rows of TableA that are NOT present in TableB.
-- With the fixture above this returns only (4, 'Sherlin').
SELECT a.ID, a.[Name]
FROM TableA AS a
WHERE NOT EXISTS (SELECT 1 FROM TableB AS b WHERE b.ID = a.ID);

-- OR

-- Option 2: the same result with NOT IN. NULL ids are filtered out of the subquery
-- because a single NULL in a NOT IN list makes the predicate UNKNOWN for every row.
SELECT a.ID, a.[Name]
FROM TableA AS a
WHERE a.ID NOT IN (SELECT b.ID FROM TableB AS b WHERE b.ID IS NOT NULL);

DROP TABLE TableA;
DROP TABLE TableB;

How to use DateDiff into only one SELECT statement?

I want to shorten the way I use the DATEDIFF function in my SQL query. In my code, I create two temporary tables and then select from them using the DATEDIFF function.
I would like to simplify this code so that it uses only ONE SELECT statement and still returns the same results. Is that possible?
Here is my result:
This is my SQL Query
-- Sample data. A temp table is created (a "#" name cannot be DECLAREd as a table
-- variable) so that every later reference to #Temp resolves.
CREATE TABLE #Temp (ID int, Stamp datetime);

INSERT INTO #Temp (ID, Stamp) VALUES (1, '2016-08-17');
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE()+0.5);
INSERT INTO #Temp (ID, Stamp) VALUES (2, '2016-08-16');
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE()+3);

-- Two identically numbered copies of the data. Stamp is part of the ordering so
-- that consecutive numbers within an ID always follow chronological order;
-- ordering by ID alone leaves the numbering inside an ID undefined.
SELECT ROW_NUMBER() OVER (ORDER BY ID, Stamp) AS c, ID, Stamp
INTO #Temp2
FROM #Temp;

SELECT ROW_NUMBER() OVER (ORDER BY ID, Stamp) AS d, ID, Stamp
INTO #Temp3
FROM #Temp;

-- Pair each row with the previous row of the same ID (c = d + 1) and report the
-- day difference; the first row of an ID has no predecessor and yields 0.
SELECT temp2.ID, temp2.Stamp, ISNULL(DATEDIFF(day, temp3.Stamp, temp2.Stamp), 0) AS DateDiff
FROM #Temp2 AS temp2
LEFT JOIN #Temp3 AS temp3
    ON temp2.ID = temp3.ID
   AND temp2.c = temp3.d + 1;
Thanks!

If you are using SQL Server 2012:
-- Day difference to the previous stamp of the same id; 0 for the first row of an id.
SELECT
    t1.*,
    ISNULL(
        DATEDIFF(day, LAG(t1.stamp) OVER (PARTITION BY t1.id ORDER BY t1.stamp), t1.stamp),
        0
    )
FROM #temp AS t1
Else use this..
-- stamped: every row plus its chronological position inside its id.
;WITH stamped AS
(
    SELECT
        t1.*,
        ROW_NUMBER() OVER (PARTITION BY t1.id ORDER BY t1.stamp) AS seq
    FROM #temp AS t1
)
-- Join each row to its predecessor (seq - 1) within the same id.
SELECT
    cur.id,
    cur.stamp,
    ISNULL(DATEDIFF(day, prev.stamp, cur.stamp), 0) AS datee
FROM stamped AS cur
LEFT JOIN stamped AS prev
    ON prev.id = cur.id
   AND prev.seq + 1 = cur.seq

You could remove insert into the temp-tables and use subselects within the final query:
-- Sample data. A temp table is created (a "#" name cannot be DECLAREd as a table
-- variable) so that the subselects below can read #Temp.
CREATE TABLE #Temp (ID int, Stamp datetime);

INSERT INTO #Temp (ID, Stamp) VALUES (1, '2016-08-17');
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE()+0.5);
INSERT INTO #Temp (ID, Stamp) VALUES (2, '2016-08-16');
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE()+3);

-- Both subselects number the rows identically. Stamp is part of the ordering so
-- that consecutive numbers within an ID follow chronological order; ordering by
-- ID alone leaves the numbering inside an ID undefined.
SELECT temp2.ID, temp2.Stamp, ISNULL(DATEDIFF(day, temp3.Stamp, temp2.Stamp), 0) AS DateDiff
FROM (SELECT ROW_NUMBER() OVER (ORDER BY ID, Stamp) AS c, ID, Stamp FROM #Temp) AS temp2
LEFT JOIN (SELECT ROW_NUMBER() OVER (ORDER BY ID, Stamp) AS d, ID, Stamp FROM #Temp) AS temp3
    ON temp2.ID = temp3.ID
   AND temp2.c = temp3.d + 1;   -- previous row of the same ID

In SQL Server 2012+, you would just use lag():
-- Day difference to the previous stamp of the same id; 0 for the first row of an id.
-- The comma after t.* is required: without it the statement does not parse.
select t.*,
       isnull(datediff(day, lag(stamp) over (partition by id order by stamp), stamp), 0) as days_since_prev
from #temp t;
In earlier versions, I would use outer apply:
-- For each row, look up the latest earlier stamp of the same id (if any) and
-- report the day difference; rows without an earlier stamp yield 0.
SELECT
    cur.*,
    ISNULL(DATEDIFF(day, prev.stamp, cur.stamp), 0)
FROM #temp AS cur
OUTER APPLY (
    SELECT TOP 1 p.*
    FROM #temp AS p
    WHERE p.id = cur.id
      AND p.stamp < cur.stamp
    ORDER BY p.stamp DESC
) AS prev;

try a cte,
-- Sample data. A temp table is created (a "#" name cannot be DECLAREd as a table
-- variable) so that the CTE below can read #Temp.
CREATE TABLE #Temp (ID int, Stamp datetime);

INSERT INTO #Temp (ID, Stamp) VALUES (1, '2016-08-17');
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (1, GETDATE()+0.5);
INSERT INTO #Temp (ID, Stamp) VALUES (2, '2016-08-16');
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE());
INSERT INTO #Temp (ID, Stamp) VALUES (2, GETDATE()+3);

-- Number the rows chronologically inside each ID. Ordering by ID alone would
-- leave the numbering within an ID undefined and the "previous row" arbitrary.
;WITH CTE AS
(
    SELECT ROW_NUMBER() OVER (PARTITION BY ID ORDER BY Stamp) AS RowNo, ID, Stamp
    FROM #Temp
)
-- Join each row to its predecessor within the same ID; the first row yields 0.
SELECT temp2.ID, temp2.Stamp, ISNULL(DATEDIFF(day, temp3.Stamp, temp2.Stamp), 0) AS DateDiff
FROM CTE AS temp2
LEFT JOIN CTE AS temp3
    ON temp2.ID = temp3.ID
   AND temp2.RowNo = temp3.RowNo + 1;

Select top dates grouped by ID's

I have a table as follows:
-- One row per recorded date for a (userID, testID) pair.
-- Created as a temp table: a "#" name cannot be DECLAREd as a table variable, and
-- the INSERTs and queries that follow all refer to #tmp.
CREATE TABLE #tmp
(
    userID int,
    testID int,
    someDate datetime
)
Within it I store dates along with two ID values, e.g.
-- Three dates for user 1 / test 50 and two dates for user 2 / test 20.
INSERT INTO #tmp (userID, testID, someDate)
VALUES
    (1, 50, '2010-10-01'),
    (1, 50, '2010-11-01'),
    (1, 50, '2010-12-01'),
    (2, 20, '2010-10-01'),
    (2, 20, '2010-11-01')
I need to select the latest date per userID/testID combination. So, the result would be
userID testID someDate
1 50 2010-12-01
2 20 2010-11-01
It sounds really easy but I can't figure it out. SQL Fiddle Here.

-- Latest someDate for every (userID, testID) combination.
SELECT
    t.userID,
    t.testID,
    MAX(t.someDate)
FROM #tmp AS t
GROUP BY
    t.testID,
    t.userID;
fiddle

Try
-- Rows holding the latest someDate of their (userID, testID) pair.
-- testID is part of both the grouping and the join: grouping on userID alone
-- would give one date per user, not one per userID/testID combination.
SELECT t1.userID, t1.testID, t1.someDate
FROM #tmp AS t1
INNER JOIN (
    SELECT userID, testID, MAX(someDate) AS someDate
    FROM #tmp
    GROUP BY userID, testID
) AS t2
    ON t1.userID = t2.userID
   AND t1.testID = t2.testID
   AND t1.someDate = t2.someDate

-- One row per (userId, testId) pair, carrying that pair's most recent someDate.
SELECT src.userId, src.testId, MAX(src.someDate)
FROM #tmp AS src
GROUP BY src.testId, src.userId
http://www.sqlfiddle.com/#!6/d41d8/5205

Merging records based on a time difference?

I have the following table:
-- Sample intervals: three for John and two for Adam.
CREATE TABLE #TEMP
(
    id        int,
    name      varchar(255),
    startdate datetime,
    enddate   datetime
);

INSERT INTO #TEMP (id, name, startdate, enddate) VALUES (1, 'John', '2011-01-11 00:00:00.000', '2011-01-11 00:01:10.000');
INSERT INTO #TEMP (id, name, startdate, enddate) VALUES (2, 'John', '2011-01-11 00:00:20.000', '2011-01-11 00:01:50.000');
INSERT INTO #TEMP (id, name, startdate, enddate) VALUES (3, 'John', '2011-01-11 00:01:40.000', '2011-01-11 00:01:50.000');
INSERT INTO #TEMP (id, name, startdate, enddate) VALUES (4, 'Adam', '2011-01-11 00:00:40.000', '2011-01-11 00:01:20.000');
INSERT INTO #TEMP (id, name, startdate, enddate) VALUES (5, 'Adam', '2011-01-11 00:00:10.000', '2011-01-11 00:01:30.000');

SELECT id, name, startdate, enddate
FROM #TEMP;

DROP TABLE #TEMP;
I am trying to merge all records with the same name within a range of 60 seconds to each other to get the following:
John 2011-01-11 00:00:00.000 2011-01-11 00:01:10.000
John 2011-01-11 00:01:40.000 2011-01-11 00:01:50.000
Adam 2011-01-11 00:00:10.000 2011-01-11 00:01:20.000
Any suggestions on how to do this on a table with about 50K records? Currently, I managed to get to this:
-- Show the raw input rows.
SELECT * FROM #TEMP
-- #Merge collects (id1, id2) pairs of rows that were flagged as merge candidates.
CREATE TABLE #Merge(id1 int, id2 int)
INSERT INTO #Merge
SELECT id, uuid
FROM
(
-- Flag = 1 when DATEDIFF in seconds from t.startdate to u.ustartdate lies between
-- 0 and 60 inclusive; only the start dates are compared, enddate is not used.
SELECT t.id, u.uuid, t.name, t.startdate, t.enddate, u.ustartdate, u.uenddate,
(CASE WHEN (DATEDIFF(second, t.startdate, u.ustartdate) <= 60 AND DATEDIFF(second, t.startdate, u.ustartdate) >= 0) then 1 else 0 END) Flag
FROM #Temp t
INNER JOIN
-- u is #Temp again, with id/startdate/enddate renamed so both sides can be told apart.
(SELECT id AS uuid, name, startdate AS ustartdate, enddate AS uenddate
FROM #Temp) u
-- Pair rows of the same name, excluding a row with itself and rows with an identical startdate.
ON t.name = u.name AND t.startdate != u.ustartdate AND t.id != u.uuid
) w
WHERE Flag = 1
SELECT * FROM #Merge
-- Insert non-mergable records
-- NOTE(review): #TEMP2 declares five columns (including membergroup), while the
-- CREATE TABLE #TEMP shown with this question declares four; the INSERT ... SELECT *
-- below needs matching column counts - confirm which #TEMP definition is in use.
CREATE TABLE #TEMP2 (id int, name varchar(255), membergroup varchar(255), startdate datetime, enddate datetime)
INSERT INTO #TEMP2
SELECT * FROM #TEMP
-- Keep only rows that appear on neither side of any #Merge pair.
WHERE id NOT IN (SELECT id1 FROM #Merge UNION SELECT id2 FROM #Merge)
SELECT * FROM #TEMP2
However, I am not sure how to proceed from here. The #Merge table gives me the rows that are to be merged. What I did was to insert the non-mergeable rows into #Temp2 first.
EDIT:
Updated set of rows, just in case:
-- Updated sample rows: each INSERT supplies five values (id, name, a one-letter
-- code, and two datetimes).
-- NOTE(review): the third value presumably maps to the membergroup column declared
-- on #TEMP2; the four-column CREATE TABLE #TEMP shown earlier would reject these
-- rows - confirm the intended #TEMP definition.
INSERT INTO #TEMP VALUES(1, 'John', 'A', '2011-01-11 00:00:00.000','2011-01-11 00:01:10.000')
INSERT INTO #TEMP VALUES(2, 'John', 'A', '2011-01-11 00:00:01.000','2011-01-11 00:01:10.000')
INSERT INTO #TEMP VALUES(3, 'John', 'B', '2011-01-11 00:00:20.000','2011-01-11 00:01:50.000')
INSERT INTO #TEMP VALUES(4, 'John', 'C', '2011-01-11 00:01:40.000','2011-01-11 00:01:50.000')
INSERT INTO #TEMP VALUES(5, 'John', 'C', '2011-01-11 00:01:50.000','2011-01-11 00:02:20.000')
INSERT INTO #TEMP VALUES(6, 'Adam', 'A', '2011-01-11 00:00:40.000','2011-01-11 00:01:20.000')
INSERT INTO #TEMP VALUES(7, 'Adam', 'B', '2011-01-11 00:00:10.000','2011-01-11 00:01:30.000')
INSERT INTO #TEMP VALUES(8, 'Adam', 'B', '2011-01-11 00:03:10.000','2011-01-11 00:04:30.000')

The code below manages to show both the merged rows (rows 1-2, 4-5) and the unique rows (row 3):
-- a is the candidate row; b is any higher-id row and c any lower-id row of the same
-- name whose startdate DATEDIFF (in seconds, measured from the lower-id row) is <= 60.
-- DISTINCT collapses the copies of a produced when several b rows match.
SELECT DISTINCT a.id,a.name,a.startdate,a.enddate
-- NOTE(review): reads a table named temp, not the #TEMP used in the question - confirm.
FROM temp a
LEFT JOIN temp b ON a.name = b.name AND a.id < b.id AND DATEDIFF(s,a.startdate,b.startdate)<=60
LEFT JOIN temp c ON c.name = a.name AND c.id < a.id AND DATEDIFF(s,c.startdate,a.startdate)<=60
-- NOTE(review): c only joins when c.id < a.id, so a.id <= COALESCE(c.id,a.id) can hold
-- only when c.id IS NULL (no nearby lower-id row); in that case the parenthesised
-- condition is already true. Confirm that this is the intended filter.
-- NOTE(review): both DATEDIFF checks are bounded only from above, so negative
-- differences also pass - confirm.
WHERE (b.id IS NOT NULL OR c.id IS NULL) AND a.id <= COALESCE(c.id,a.id)

Given you haven't said how to use the 60 second interval and your sample code showed only a startdate comparison, here you go
-- Pair every row with the highest-id later row (greater id) of the same name whose
-- startdate DATEDIFF from the outer row is below 60 seconds. CROSS APPLY drops
-- outer rows for which no such partner exists.
SELECT *
FROM #Temp AS t1
CROSS APPLY (
    SELECT TOP 1 *
    FROM #Temp AS t2
    WHERE t2.name = t1.name
      AND t2.id > t1.id
      AND DATEDIFF(second, t1.startdate, t2.startdate) < 60
    ORDER BY t2.id DESC
) AS t2x
Based on startdate only, row pairs 1/2 and 4/5 make it into the output. Row 3 doesn't so you'll have to explain why you added it.
That is, row id = 3 is not within 60 seconds of row 1 or 2 based on startdate. So it shouldn't be in the output.
This assumes that id and startdate are both increasing.
Edit, after chat:
-- Branch 1: pair every row t1 with the highest-id later row (t1.id < t2.id) of the
-- same name whose startdate DATEDIFF from t1 is below 60 seconds. CROSS APPLY
-- drops t1 rows for which no such partner exists.
SELECT
*
FROM
#Temp t1
CROSS APPLY
(SELECT TOP 1 *
FROM #Temp t2
WHERE t1.name = t2.name AND DATEDIFF(second, t1.startdate, t2.startdate) < 60 AND t1.id < t2.id
ORDER BY t2.id DESC
) t2x
UNION ALL
-- Branch 2: rows that take part in no pair at all. The columns are emitted twice
-- (t1.*, t1.*) so the column count matches branch 1.
SELECT
t1.*, t1.*
FROM
#Temp t1
WHERE NOT EXISTS
(
-- Recompute the same pairs as branch 1 and check whether the outer t1 row appears
-- on either side of one of them.
SELECT
t1ZZ.id, t2xZZ.id
FROM
#Temp t1ZZ
CROSS APPLY
(SELECT TOP 1 *
FROM #Temp t2ZZ
WHERE t1ZZ.name = t2ZZ.name AND DATEDIFF(second, t1ZZ.startdate, t2ZZ.startdate) < 60 AND t1ZZ.id < t2ZZ.id
ORDER BY t2ZZ.id DESC
) t2xZZ
WHERE
t1.id IN (t1ZZ.id, t2xZZ.id)
)

SQL Statement(s)

If I have the following table:
-- Sample data: question 1 has two versions, question 2 three, question 3 one.
CREATE TABLE #temp
(
    id       int,
    num      int,
    question varchar(50),
    qversion int
);

INSERT INTO #temp (id, num, question, qversion) VALUES (1, 1, 'Question 1 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (2, 1, 'Question 1 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (3, 2, 'Question 2 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (4, 2, 'Question 2 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (5, 2, 'Question 2 v3', 3);
INSERT INTO #temp (id, num, question, qversion) VALUES (6, 3, 'Question 3 v1', 1);

SELECT id, num, question, qversion
FROM #temp;

DROP TABLE #temp;
How can I get a table that displays the three questions in their latest version? This is in SQL Server 2005.

-- Fixture: question 1 has two versions, question 2 three, question 3 one.
CREATE TABLE #temp
(
    id       int,
    num      int,
    question varchar(50),
    qversion int
);

INSERT INTO #temp (id, num, question, qversion) VALUES (1, 1, 'Question 1 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (2, 1, 'Question 1 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (3, 2, 'Question 2 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (4, 2, 'Question 2 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (5, 2, 'Question 2 v3', 3);
INSERT INTO #temp (id, num, question, qversion) VALUES (6, 3, 'Question 3 v1', 1);

-- newest: the highest qversion recorded for each question number.
WITH newest AS
(
    SELECT num, MAX(qversion) AS qversion
    FROM #temp
    GROUP BY num
)
-- Keep only the rows that carry their question's highest version.
SELECT q.*
FROM #temp AS q
INNER JOIN newest AS n
    ON n.num = q.num
   AND n.qversion = q.qversion;

DROP TABLE #temp;

-- Join every row to the newer versions of the same question and count the joined
-- rows. A row without any newer version still counts 1 (the NULL-extended row of
-- the LEFT JOIN), so the filter passes rows with at most two newer versions.
SELECT
    cur.id,
    cur.num,
    cur.question,
    cur.qversion
FROM #temp AS cur
LEFT JOIN #temp AS newer
    ON newer.num = cur.num
   AND newer.qversion > cur.qversion
GROUP BY
    cur.id,
    cur.num,
    cur.question,
    cur.qversion
HAVING COUNT(*) <= 2;

You're using SQL Server 2005, so it's worth at least exploring the over clause:
-- Tag every row with the highest qversion of its question (num), then keep the
-- rows whose own version equals that maximum.
SELECT versions.*
FROM (
    SELECT
        *,
        MAX(qversion) OVER (PARTITION BY num) AS maxVersion
    FROM #temp
) AS versions
WHERE versions.qversion = versions.maxVersion

I would like to get a table to display the three latest versions of each question.
I assume that that qversion is increasing with time. If this assumption is backwards, remove the desc keyword from the answer.
The table definition does not have an explicit not null constraint on qversion. I assume that rows with a null qversion should be excluded. (Note: Depending on settings, the lack of an explicit null/not null in the declaration may result in a not null constraint.) If the table does have a not null constraint, then the text where qversion is not null should be removed. If qversion can be null, and nulls need to be included in the result set, then additional changes will need to be made.
-- Fixture: question 2 has four versions so that one of its rows falls outside the
-- top three, and question 4 has a NULL version that the query filters out.
CREATE TABLE #temp
(
    id       int,
    num      int,
    question varchar(50),
    qversion int
);

INSERT INTO #temp (id, num, question, qversion) VALUES (1, 1, 'Question 1 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (2, 1, 'Question 1 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (3, 2, 'Question 2 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (4, 2, 'Question 2 v2', 2);
INSERT INTO #temp (id, num, question, qversion) VALUES (5, 2, 'Question 2 v3', 3);
INSERT INTO #temp (id, num, question, qversion) VALUES (7, 2, 'Question 2 v4', 4);
INSERT INTO #temp (id, num, question, qversion) VALUES (6, 3, 'Question 3 v1', 1);
INSERT INTO #temp (id, num, question, qversion) VALUES (8, 4, 'Question 4 v?', NULL);

-- Rank the non-NULL versions of each question from newest to oldest and keep the
-- first three of every question.
SELECT
    ranked.id,
    ranked.num,
    ranked.question,
    ranked.qversion
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY num ORDER BY qversion DESC) AS RN
    FROM #temp
    WHERE qversion IS NOT NULL
) AS ranked
WHERE ranked.RN <= 3

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How do I remove all but some records based on a threshold? - sql

I will change your select like this (not tested) select name from #temp group by name having count(id) > 3 then you can implement your query in a delete statement using your select as a where clause

in inner query you can use row_number function over (partition by id) and then in outer query you have to give condition like below select id,name from ( SELECT id,name, row_number() over (partition by id order by 1) count_id FROM #test group by id, name ) where count_id <=3

If i got your question right, you need to get rows when id occurrence 3 or more times select t1.name,t1.id from tbl1 t1 inner join tbl1 t2 on t1.id = t2.id group by t1.name, t1.id having count(t1.id) > 2

Related

Sql query regarding foreign key dependency

How to use DateDiff into only one SELECT statement?

Select top dates grouped by ID's

Merging records based on a time difference?

SQL Statement(s)

Categories

Resources