how to insert many records excluding some - sql

I want to create a table with a subset of records from a master table.
for example, i have:
id name code
1 peter 73
2 carl 84
3 jack 73
I want to store peter and carl but not jack because has same peter's code.
I need hight performance because i have 20M records.
I try this:
SELECT id, name, DISTINCT(code) INTO new_tab
FROM old_tab
WHERE (conditions)
but don't work.

Assuming you want to pick the row with the maximum id per code, then this should do it:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id DESC)
FROM old_tab WHERE rnk = 1
)
)
and for the minimum id per code, just change the sort order in the rank from DESC to ASC:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id ASC)
FROM old_tab WHERE rnk = 1
)
)

Using a derived table, you can find the minID for each code, then join back to that in the outer to get the rest of the columns for that ID from oldTab.
select id,name,code
insert into newTabFROM
from old_tab t inner join
(SELECT min(id) as minId, code
from old_tab group by code) x
on t.id = x.minId
WHERE (conditions)

Try this:
CREATE TABLE #Temp
(
ID INT,
Name VARCHAR(50),
Code INT
)
INSERT #Temp VALUES (1, 'Peter', 73)
INSERT #Temp VALUES (2, 'Carl', 84)
INSERT #Temp VALUES (3, 'Jack', 73)
SELECT t2.ID, t2.Name, t2.Code
FROM #Temp t2
JOIN (
SELECT t.Code, MIN(t.ID) ID
FROM #temp t
JOIN (
SELECT DISTINCT Code
FROM #Temp
) d
ON t.Code = d.Code
GROUP BY t.Code
) b
ON t2.ID = b.ID

Related

Select unique field

I have this table:
TableA
----------------
ID (pk) Name
1 A
2 B
3 C
4 A
5 D
6 A
7 B
8 A
9 D
10 C
....
I need to randomly extract with a SELECT TOP 5 ID, Name FROM TableA
with Name that must be unique within the 5 records.
I'm trying :
;WITH group
AS
(
SELECT ID, Name,
ROW_NUMBER() OVER (PARTITION BY Name ORDER BY NewId()) rn
FROM TableA
)
SELECT ID, Name
FROM group
WHERE rn = 1
but every time I have quite the same results.
I need to select between all the values for ID at random, assuring that Name will always be different for each record.
I hope the problem is understandable. Any ideas?
Found a solution. It seems to work!
;WITH group
AS (
SELECT ID, Name, ROW_NUMBER() OVER (PARTITION BY Name ORDER BY NewId()) rn FROM TableA )
SELECT top 5 ID, Name, NewId() [NewId]
FROM group
WHERE rn = 1
ORDER BY [newid]
Perhaps the problem is that although newid() is random, it may tend to be sequential. Does this fix the problem?
WITH g as (
SELECT ID, Name,
ROW_NUMBER() OVER (PARTITION BY Name ORDER BY RAND(CHECKSUM(NewId()))) as rn
FROM TableA
)
SELECT ID, Name
FROM g
WHERE rn = 1;
CREATE TABLE #test(ID INT ,Name VARCHAR(1)) INSERT INTO #test(ID ,Name )
SELECT 1,'A' UNION ALL SELECT 2,'B' UNION ALL SELECT 3,'C' UNION ALL
SELECT 4,'A' UNION ALL SELECT 5,'D'UNION ALL SELECT 6,'A' UNION ALL
SELECT 7,'B' UNION ALL SELECT 8,'A'UNION ALL SELECT 9,'D' UNION ALL
SELECT 10,'C'
SELECT T1.ID ,T1.Name FROM #test T1
JOIN ( SELECT TOP 5 Name FROM #test T2 ORDER BY NEWID()
) A ON T1.Name = A.Name ORDER BY A.Name
;WITH group
AS
(
SELECT ID, Name,
ROW_NUMBER() OVER (PARTITION BY Name ORDER BY NewId()) rn
FROM TableA
)
SELECT top 5 ID, Name, NewId() [NewId]
FROM group
WHERE rn = 1
ORDER BY [newid]

Get the maximum values of column B per each distinct value of column A sql

I have this table:
I am trying to pull all records from this table for the max value in the DIST_NO column for every distinct ID in the left most column, but I still want to pull every record for each ID in which there are different Product_ID's as well.
I tried partitioning and using row_number, but I am having trouble at the moment.
Here are my desired results:
This is what my code looks like currently:
select *
from
(SELECT *,
ROW_NUMBER() OVER (PARTITION BY ID ORDER BY DIST_NO DESC) RN
FROM Table) V
WHERE RN<=3
you want the max(DIST_NO) for each ID, product_ID?
If so, you can:
SELECT
ID, product_ID, max(DIST_NO)
from table
group by ID, product_ID
If you want the detail rows related to the max row, you just need to join it back to your table:
Select
t.ID, max_dist_no, TRANSaction_ID , LINE_NO , PRODUCT_ID
from
table t inner join
(SELECT
ID, max(DIST_NO) as max_dist_no
from table
group by ID) mx on
t.ID = mx.ID and
t.DIST_NO = max_DIST_NO
Try
SELECT MT.ID
, MT.DIST_NO
, MT.TRANS_ID
, MT.LINE_NO
, MT.PRODUCT_ID
FROM MYTABLE MT
INNER JOIN (
SELECT T.ID, MAX(T.DIST_NO) as DIST_NO FROM MYTABLE T
GROUP BY T.ID
) MAX_MT ON MT.Id = MAX_MT.ID AND MT.DIST_NO = MAX_MT.DIST_NO
The sub query returns each combination of ID and Max value of DIST_NO:
SELECT T.ID, MAX(T.DIST_NO) as DIST_NO FROM MYTABLE T
GROUP BY T.ID
Joining this back to your original table will basically filter your original data-set by only these combinations of values.
Tested on PostgreSQL:
WITH t1 AS (
SELECT id, product_id, MAX(dist_no) AS dist_no
FROM test
GROUP BY 1,2)
SELECT t1.id, t1.dist_no, t2.trans_id, t2.line_no, t1.product_id
FROM test t2, t1
WHERE t1.id=t2.id AND t1.product_id=t2.product_id AND t1.dist_no=t2.dist_no
Use rank() or dense_rank():
select t.*
from (SELECT t.*
RANK() OVER (PARTITION BY ID ORDER BY DIST_NO DESC) as seqnum
FROM Table t
) t
WHERE seqnum = 1;
This is almost a literal translation of your request:
I am trying to pull all records from this table for the max value in
the DIST_NO column for every distinct ID in the left most column.
you can try something like this one :). (But is your result correct? I think there is little mistake in TRANS_ID...)
DECLARE #ExampleTable TABLE
(ID INT,
DIST_NO INT,
TRANS_ID INT,
LINE_NO INT,
PRODUCT_ID INT)
INSERT INTO #ExampleTable
( ID, DIST_NO, TRANS_ID,LINE_NO, PRODUCT_ID )
VALUES ( 102657, 1, 1105365, 1, 109119 ),
( 102657, 1, 1105366, 2, 109114 ),
( 102657, 2, 1105365, 1, 109119 ),
( 102657, 2, 1105366, 2, 109114 ),
( 104371, 1, 1190538, 1, 110981 ),
( 104371, 2, 1190538, 1, 110981 )
;WITH CTE AS ( SELECT DISTINCT ID, LINE_NO
FROM #ExampleTable)
SELECT a.ID,
x.DIST_NO,
x.TRANS_ID,
x.LINE_NO,
x.PRODUCT_ID
FROM CTE a
CROSS APPLY (SELECT TOP 1 *
FROM #ExampleTable f
WHERE a.ID = f.ID AND
a.LINE_NO = f. LINE_NO
ORDER BY DIST_NO DESC) x

SQL:selecting top record for a group of column

I need to select top record with max value for a group of other columns. In the sample data below, i need to select top record with max 'Weightage' for each group of 'id', 'ItemType'
create table sampleTable(id int, ItemType varchar(10), ItemCode varchar(10), Weightage int)
insert into sampleTable values(1, 'A','aaa',2)
insert into sampleTable values(1, 'A','bbb',3)
insert into sampleTable values(1, 'A','ccc',3)
insert into sampleTable values(1, 'B','ddd',1)
insert into sampleTable values(1, 'B','eee',2)
insert into sampleTable values(2, 'A','aaa',1)
The expected output should be
id ItemType ItemCode
--------------------------------
1 A bbb
1 B eee
2 A aaa
I tried as follows
SELECT top 1 id, ItemType,ItemCode
FROM sampleTable WITH(NOLOCK)
GROUP BY id,ItemType,ItemCode,Weightage
ORDER BY Weightage desc
But it is not giving expected result. Thanks
Here is one way using ROW_NUMBER
SELECT TOP 1 WITH ties id,
itemtype,
itemcode
FROM sampletable WITH(nolock)
GROUP BY id,
itemtype,
itemcode,
weightage
ORDER BY Row_number()OVER(partition BY id, itemtype ORDER BY weightage DESC)
TOP 1 with TIES will return the records with Tie based on Order by
Hope you know the meaning of using WITH(NOLOCK) hint. It will pull uncommitted data as well
Here's one option using row_number():
select *
from (
select *, row_number() over (partition by id, itemtype order by Weightage desc) rn
from sampletable
) t
where rn = 1
SQL Fiddle Demo

How to select top 3 values from each group in a table with SQL which have duplicates [duplicate]

This question already has answers here:
Select top 10 records for each category
(14 answers)
Closed 5 years ago.
Assume we have a table which has two columns, one column contains the names of some people and the other column contains some values related to each person. One person can have more than one value. Each value has a numeric type. The question is we want to select the top 3 values for each person from the table. If one person has less than 3 values, we select all the values for that person.
The issue can be solved if there are no duplicates in the table by the query provided in this article Select top 3 values from each group in a table with SQL . But if there are duplicates, what is the solution?
For example, if for one name John, he has 5 values related to him. They are 20,7,7,7,4. I need to return the name/value pairs as below order by value descending for each name:
-----------+-------+
| name | value |
-----------+-------+
| John | 20 |
| John | 7 |
| John | 7 |
-----------+-------+
Only 3 rows should be returned for John even though there are three 7s for John.
In many modern DBMS (e.g. Postgres, Oracle, SQL-Server, DB2 and many others), the following will work just fine. It uses CTEs and ranking function ROW_NUMBER() which is part of the latest SQL standard:
WITH cte AS
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
)
SELECT name, value, rn
FROM cte
WHERE rn <= 3
ORDER BY name, rn ;
Without CTE, only ROW_NUMBER():
SELECT name, value, rn
FROM
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
) tmp
WHERE rn <= 3
ORDER BY name, rn ;
Tested in:
Postgres
Oracle
SQL-Server
In MySQL and other DBMS that do not have ranking functions, one has to use either derived tables, correlated subqueries or self-joins with GROUP BY.
The (tid) is assumed to be the primary key of the table:
SELECT t.tid, t.name, t.value, -- self join and GROUP BY
COUNT(*) AS rn
FROM t
JOIN t AS t2
ON t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
GROUP BY t.tid, t.name, t.value
HAVING COUNT(*) <= 3
ORDER BY name, rn ;
SELECT t.tid, t.name, t.value, rn
FROM
( SELECT t.tid, t.name, t.value,
( SELECT COUNT(*) -- inline, correlated subquery
FROM t AS t2
WHERE t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
) AS rn
FROM t
) AS t
WHERE rn <= 3
ORDER BY name, rn ;
Tested in MySQL
I was going to downvote the question. However, I realized that it might really be asking for a cross-database solution.
Assuming you are looking for a database independent way to do this, the only way I can think of uses correlated subqueries (or non-equijoins). Here is an example:
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3)
However, each database that you mention (and I note, Hadoop is not a database) has a better way of doing this. Unfortunately, none of them are standard SQL.
Here is an example of it working in SQL Server:
with t as (
select 1 as personid, 5 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 7 as val union all
select 1 as personid, 8 as val
)
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3);
Using GROUP_CONCAT and FIND_IN_SET you can do that.Check SQLFIDDLE.
SELECT *
FROM tbl t
WHERE FIND_IN_SET(t.value,(SELECT
SUBSTRING_INDEX(GROUP_CONCAT(t1.value ORDER BY VALUE DESC),',',3)
FROM tbl t1
WHERE t1.name = t.name
GROUP BY t1.name)) > 0
ORDER BY t.name,t.value desc
If your result set is not so heavy, you can write a stored procedure (or an anonymous PL/SQL-block) for that problem which iterates the result set and finds the bigges three by a simple comparing algorithm.
Try this -
CREATE TABLE #list ([name] [varchar](100) NOT NULL, [value] [int] NOT NULL)
INSERT INTO #list VALUES ('John', 20), ('John', 7), ('John', 7), ('John', 7), ('John', 4);
WITH cte
AS (
SELECT NAME
,value
,ROW_NUMBER() OVER (
PARTITION BY NAME ORDER BY (value) DESC
) RN
FROM #list
)
SELECT NAME
,value
FROM cte
WHERE RN < 4
ORDER BY value DESC
This works for MS SQL. Should be workable in any other SQL dialect that has the ability to assign row numbers in a group by or over clause (or equivelant)
if object_id('tempdb..#Data') is not null drop table #Data;
GO
create table #data (name varchar(25), value integer);
GO
set nocount on;
insert into #data values ('John', 20);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 5);
insert into #data values ('Jack', 5);
insert into #data values ('Jane', 30);
insert into #data values ('Jane', 21);
insert into #data values ('John', 5);
insert into #data values ('John', -1);
insert into #data values ('John', -1);
insert into #data values ('Jane', 18);
set nocount off;
GO
with D as (
SELECT
name
,Value
,row_number() over (partition by name order by value desc) rn
From
#Data
)
SELECT Name, Value
FROM D
WHERE RN <= 3
order by Name, Value Desc
Name Value
Jack 5
Jane 30
Jane 21
Jane 18
John 20
John 7
John 7

retrieve the most recent record for each customer

I have this data:
ID NAME DATE
3 JOHN 2011-08-08
2 YOKO 2010-07-07
1 JOHN 2009-06-06
Code (for SQL Server 2005):
DECLARE #TESTABLE TABLE (id int, name char(4), date smalldatetime)
INSERT INTO #TESTABLE VALUES (3, 'JOHN', '2011-08-08')
INSERT INTO #TESTABLE VALUES (2, 'YOKO', '2010-07-07')
INSERT INTO #TESTABLE VALUES (1, 'JOHN', '2009-06-06')
I want to get, for each NAME, the ID that has the most recent DATE. Like this:
3 JOHN 2011-08-08
2 YOKO 2010-07-07
What is the most elegant way of accomplishing this?
;WITH x AS
(
SELECT ID, NAME, [DATE],
rn = ROW_NUMBER() OVER
(PARTITION BY NAME ORDER BY [DATE] DESC)
FROM #TESTABLE
)
SELECT ID, NAME, [DATE] FROM x WHERE rn = 1
ORDER BY [DATE] DESC;
Try to avoid reserved words (and vague column names) like [DATE]...
SELECT <fields>
FROM SourceTable st
INNER JOIN (SELECT name, MAX(Datefield) as Datefield
FROM SourceTable
GROUP BY name) x
ON x.Name = st.name
AND x.datefield = st.datefield
below is a possible solution:
Select c.CustomerID, c.CustomerName, c.CustomerOrder, c.CustomerOrderDate, c.CustomerQty
from tblCustomer c
inner join (select c2.CustomerName, MAX(c2.CustomerOrderDate) as MaxDate from tblCustomer c2 group by c2.CustomerName) c2
on c.CustomerName = c2.CustomerName
where c.CustomerOrderDate = c2.MaxDate