Select 30% of each column value - sql

Let's assume we have a table with a column 'A' that has values from 0 to N. I want to select 30% of the rows within each group of rows that share the same value in column 'A'.
So if I have this:
A| B
-------
0 hello
0 test
0 hi
1 blah1
1 blah2
1 blah3
1 blah4
1 blah5
1 blah6
Result:
A| B
-------
0 hello
1 blah1
1 blah4
it could be blah1 or any other blah that is not blah4, and blah4 can be any other blah that is not blah1, basically it could be random or skipping.
By the way, the actual table is huge, talking terabytes, so think about performance.

try something like this:
-- Demo fixture holding the OP's sample data.
-- A table variable must be named @something, so "DECLARE #YourTable table" does
-- not parse; a local temp table keeps the #YourTable name usable by follow-up
-- queries. Dropping it first keeps the script re-runnable in one session.
IF OBJECT_ID('tempdb..#YourTable') IS NOT NULL
    DROP TABLE #YourTable;
CREATE TABLE #YourTable (A int, b varchar(10));
INSERT INTO #YourTable (A, b) VALUES (0, 'hello'); --OP's data
INSERT INTO #YourTable (A, b) VALUES (0, 'test');
INSERT INTO #YourTable (A, b) VALUES (0, 'hi');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah1');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah2');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah3');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah4');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah5');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah6');

-- Number the rows inside each A group (ordered by B), derive each group's size,
-- then keep the leading ~30% of every group.
;WITH NumberedRows AS
(   SELECT
        A, B, ROW_NUMBER() OVER (PARTITION BY A ORDER BY B) AS RowNumber
    FROM #YourTable
)
, GroupCounts AS
(   -- the highest row number in a group equals that group's row count
    SELECT
        A, MAX(RowNumber) AS MaxA
    FROM NumberedRows
    GROUP BY A
)
SELECT
    n.a, n.b
FROM NumberedRows n
INNER JOIN GroupCounts c ON n.A = c.A
-- (count + 1) * 0.3 keeps 1 row of a 3-row group and 2 rows of a 6-row group
WHERE n.RowNumber <= (c.MaxA + 1) * 0.3;
OUTPUT:
a b
----------- ----------
0 hello
1 blah1
1 blah2
(3 row(s) affected)
EDIT based on the great idea in the comment from Andriy M
-- Single pass over #YourTable: every row carries its position inside its A group
-- and the size of that group, so no join back to a counts set is needed.
SELECT
    sampled.a,
    sampled.b
FROM
(
    SELECT
        A,
        B,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY A, B) AS RowNumber,
        COUNT(*)     OVER (PARTITION BY A)               AS TotalOf
    FROM #YourTable
) AS sampled
WHERE (sampled.TotalOf + 1) * 0.3 >= sampled.RowNumber
ORDER BY sampled.A;
OUTPUT:
a b
----------- ----------
0 hello
1 blah1
1 blah2
(3 row(s) affected)
EDIT here are "random" rows, using Andriy M idea:
-- Demo fixture holding the OP's sample data.
-- "DECLARE #YourTable table" is not valid (table variables are named @x), so a
-- local temp table is used; it is dropped first so the script can be re-run.
IF OBJECT_ID('tempdb..#YourTable') IS NOT NULL
    DROP TABLE #YourTable;
CREATE TABLE #YourTable (A int, b varchar(10));
INSERT INTO #YourTable (A, b) VALUES (0, 'hello'); --OP's data
INSERT INTO #YourTable (A, b) VALUES (0, 'test');
INSERT INTO #YourTable (A, b) VALUES (0, 'hi');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah1');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah2');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah3');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah4');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah5');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah6');

-- Same 30%-per-group filter, but rows are numbered in NEWID() order, so which
-- rows are kept changes from run to run.
;WITH NumberedRows AS
(   SELECT
        A, B, ROW_NUMBER() OVER (PARTITION BY A ORDER BY NEWID()) AS RowNumber
    FROM #YourTable
)
, GroupCounts AS
(   -- counted from the base table: the group sizes are the same, and the
    -- NEWID() numbering does not have to be evaluated a second time
    SELECT A, COUNT(A) AS MaxA
    FROM #YourTable
    GROUP BY A
)
SELECT
    n.A, n.B
FROM NumberedRows n
INNER JOIN GroupCounts c ON n.A = c.A
WHERE n.RowNumber <= (c.MaxA + 1) * 0.3
ORDER BY n.A;
OUTPUT:
a b
----------- ----------
0 hi
1 blah3
1 blah6
(3 row(s) affected)

This uses only one subquery, and thus a single pass through your set.
-- Rank rows by b inside each A group and keep those whose rank does not exceed
-- 30% of the group's count of non-NULL b values.
SELECT
    a,
    b
FROM
(
    SELECT
        A,
        b,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY b) AS r,
        COUNT(b)     OVER (PARTITION BY A)            AS ct
    FROM #YourTable
) AS n
WHERE n.ct * 0.3 >= n.r
As does this, although when a group has fewer than 10 rows it always returns that group's top 3 rows, and when the rows do not divide evenly the "extra" rows are assigned to the first bins:
-- Split each A group into 10 bins ordered by b and keep bins 1 through 3.
SELECT
    A,
    b
FROM
(
    SELECT
        A,
        b,
        NTILE(10) OVER (PARTITION BY a ORDER BY b) AS decile
    FROM #YourTable
) AS binned
WHERE binned.decile <= 3;

Related

SQL count number of records where value remains constant

I need to find the number of tracker_id values whose position remains 1 throughout the table.
tracker_id | position
---------------------
5 | 1
11 | 1
4 | 1
4 | 2
5 | 2
4 | 1
4 | 1
11 | 1
14 | 1
9 | 2
Here, the output should be 2, since the position of tracker_id 11 and tracker_id 14 remains 1 throughout the table.
You can use not exists
-- Count the trackers whose every row has position = 1.
-- DISTINCT is required: a tracker can own several position = 1 rows (tracker 11
-- has two in the sample data), and count(*) would count each of those rows.
select count(distinct a.tracker_id)
from tbl a
where a.position = 1
  -- no row of the same tracker carries a different position
  and not exists (select 1
                  from tbl b
                  where b.tracker_id = a.tracker_id
                    and b.position <> a.position)
Output: 2
-- Demo fixture with the question's sample rows.
-- "declare #table1 as table" does not parse (table variables are named @x), so a
-- local temp table is created instead; it is dropped first for re-runnability.
IF OBJECT_ID('tempdb..#table1') IS NOT NULL
    DROP TABLE #table1;
CREATE TABLE #table1 (tracker_id int, postion int);
insert into #table1 (tracker_id, postion) values (5,1);
insert into #table1 (tracker_id, postion) values (11,1);
insert into #table1 (tracker_id, postion) values (4,1);
insert into #table1 (tracker_id, postion) values (4,2);
insert into #table1 (tracker_id, postion) values (5,2);
insert into #table1 (tracker_id, postion) values (4,1);
insert into #table1 (tracker_id, postion) values (4,1);
insert into #table1 (tracker_id, postion) values (11,1);
insert into #table1 (tracker_id, postion) values (14,1);
insert into #table1 (tracker_id, postion) values (9,2);

-- Number of rows for every (tracker_id, postion) pair.
select count(tracker_id) as row_count, tracker_id, postion
from #table1
group by tracker_id, postion;
You can also do:
-- All distinct trackers minus the distinct trackers that have at least one row
-- with a position other than 1.
SELECT
    COUNT(DISTINCT tracker_id)
    - COUNT(DISTINCT tracker_id) FILTER (WHERE position <> 1) AS num_all_1s
FROM t;
Using uncorrelated subquery
-- Trackers that have a position = 1 row and never appear with another position.
select count(distinct tracker_id)
from t
where position = 1
  and tracker_id not in (select tracker_id
                         from t
                         where position <> 1
                           -- a single NULL in the list makes NOT IN evaluate to
                           -- UNKNOWN for every row, which would return 0
                           and tracker_id is not null);
Using window function
-- Trackers whose positions are all 1, checked with per-tracker window aggregates.
-- min = max = 1 is used rather than avg = 1: an average of 1 is also produced by
-- mixed positions such as 0 and 2, and engines that compute AVG over integers
-- as an integer (e.g. SQL Server) truncate 1,2,1,1 down to 1.
select count(distinct tracker_id)
from (select tracker_id,
             min(position) over (partition by tracker_id) as min_pos,
             max(position) over (partition by tracker_id) as max_pos
      from t) a
where min_pos = 1
  and max_pos = 1;
This one is just for giggles
-- Keep the tracker groups whose row count equals the sum of their positions,
-- then report how many groups survived (COUNT(*) OVER () counts the grouped
-- rows; DISTINCT collapses the repeated value to one row).
-- NOTE(review): count = sum only implies "all positions are 1" if no position
-- is below 1 - confirm against the data.
SELECT DISTINCT
    COUNT(*) OVER ()
FROM t
GROUP BY tracker_id
HAVING SUM(position) = COUNT(*);
And if you really want to have fun
-- Distinct trackers overall, minus distinct trackers seen with a position other
-- than 1 (the CASE yields NULL for position = 1 rows, which COUNT ignores).
SELECT
    COUNT(DISTINCT tracker_id)
    - COUNT(DISTINCT CASE WHEN position <> 1 THEN tracker_id ELSE NULL END)
FROM t;
If position can only be 1, then you can use this, which gets all the tracker_ids with only a single position value, and then limits that to those records where position = 1:
-- Step 1: trackers that only ever have one distinct position, with that position.
-- Step 2: count those whose single position is 1.
WITH single_position_trackers AS
(
    SELECT
        tracker_id,
        MAX(position) AS only_position
    FROM table1
    GROUP BY tracker_id
    HAVING COUNT(DISTINCT position) = 1
)
SELECT COUNT(tracker_id)
FROM single_position_trackers
WHERE only_position = 1

Get two records per user id sql query

I am getting data
1 34 abc5
1 24 abc3
1 12 abc2
1 24 abc1
1 34 abc6
1 34 abc76
1 24 ab1c243
1 24 abc243
1 34 abc243
1 34 abc243
from my query .. is there any way to get output like this
my query
-- Asker's query: rows of table2 whose abc2 is found by the subquery (table3
-- joined to table1 for id_usr = '13') and whose publish_status is '3',
-- ordered by guser_ID.
-- NOTE(review): the IN subquery selects * across a join, i.e. more than one
-- column; an IN subquery normally has to return a single column - confirm
-- which column was intended.
SELECT * FROM table2
WHERE (abc2 IN (SELECT * FROM table3 AS f INNER JOIN
table1 AS u ON u.id_usr = f.userLogedin_id
WHERE (u.id_usr = '13'))) AND (publish_status = '3')
ORDER BY guser_ID
1 34 abc5
1 34 abc6
1 24 abc3
1 24 abc1
1 12 abc2
i.e. orderby desc and limit to 2 per unique user .. in this case 34, 24 and 12 are unique user
try this..
-- Template (replace the <...> placeholders): number the rows inside each <col2>
-- group and keep the rows numbered 1 and 2.
select numbered.*
from
(
    select
        row_number() over (partition by <col2> order by <col2> desc) as id,
        col1,
        col2,
        col3
    from <tablename>
) numbered
where numbered.id <= 2
-- Demo fixture with the asker's rows.
-- "declare #t table" does not parse (table variables are named @x), so a local
-- temp table is created instead; it is dropped first for re-runnability.
IF OBJECT_ID('tempdb..#t') IS NOT NULL
    DROP TABLE #t;
CREATE TABLE #t (id int, value int, name varchar(10));
insert into #t (id,value,name) values (1,34,'abc5');
insert into #t (id,value,name) values (1,24,'abc3');
insert into #t (id,value,name) values (1,12,'abc2');
insert into #t (id,value,name) values (1,24,'abc1');
insert into #t (id,value,name) values (1,34,'abc6');
insert into #t (id,value,name) values (1,34,'abc76');
insert into #t (id,value,name) values (1,24,'ab1c243');
insert into #t (id,value,name) values (1,24,'abc243');
insert into #t (id,value,name) values (1,34,'abc243');
insert into #t (id,value,name) values (1,34,'abc243');

-- For every distinct value, pull at most two matching rows.
SELECT picked.*
FROM (
    SELECT DISTINCT value
    FROM #t
) AS tt
CROSS APPLY
(
    SELECT TOP 2 src.id, src.value, src.name
    FROM #t AS src
    WHERE src.value = tt.value
    -- was "ORDER BY 1 DESC": the first column, now named explicitly.
    -- id is 1 on every sample row, so which two rows win a tie is arbitrary.
    ORDER BY src.id DESC
) AS picked
-- groups are returned highest value first, as asked for in the question
ORDER BY tt.value DESC;

Assigning a Row Number in SQL Server, but grouped on a value

I want to select 2 columns from a table and assign an int value to each value. However, I want the 1st column's ID to be the same for all rows that have the same value.
For the 2nd column, I want each value to numbered as well, but partitioned by the first column. I have figured this piece out, but I can't get the first part to work.
Here is the test scenario I'm using.
-- Test fixture for the question.
-- "DECLARE #TestTable as Table" does not parse (table variables are named @x),
-- so a local temp table is created; it is dropped first for re-runnability.
IF OBJECT_ID('tempdb..#TestTable') IS NOT NULL
    DROP TABLE #TestTable;
CREATE TABLE #TestTable (Column1 char(1), Column2 char(1));
INSERT INTO #TestTable (Column1, Column2) SELECT 'A','A';
INSERT INTO #TestTable (Column1, Column2) SELECT 'A','B';
INSERT INTO #TestTable (Column1, Column2) SELECT 'A','C';
INSERT INTO #TestTable (Column1, Column2) SELECT 'B','D';
INSERT INTO #TestTable (Column1, Column2) SELECT 'B','E';
INSERT INTO #TestTable (Column1, Column2) SELECT 'B','F';
INSERT INTO #TestTable (Column1, Column2) SELECT 'B','G';
INSERT INTO #TestTable (Column1, Column2) SELECT 'B','H';
INSERT INTO #TestTable (Column1, Column2) SELECT 'C','A';
INSERT INTO #TestTable (Column1, Column2) SELECT 'C','B';
INSERT INTO #TestTable (Column1, Column2) SELECT 'C','C';

-- The asker's attempt, kept as posted: both IDs are row numbers that restart
-- at 1 inside every Column1 partition.
SELECT
    Row_Number() OVER (Partition BY Column1 ORDER BY Column1) as Column1_ID,
    Column1,
    Row_Number() OVER (Partition BY Column1 ORDER BY Column1, Column2) as Column2_ID,
    Column2
FROM #TestTable
When I run this, the values in Column2_ID are correct, but I would like the values for Column1_ID to be as follows.
Column1_ID Column1 Column2_ID Column2
1 A 1 A
1 A 2 B
1 A 3 C
2 B 1 D
2 B 2 E
2 B 3 F
2 B 4 G
2 B 5 H
3 C 1 A
3 C 2 B
3 C 3 C
You just need to use a different ranking function,
-- select-list expression: equal Column1 values share one rank, numbered without gaps
DENSE_RANK() OVER (ORDER BY Column1) AS Column1_ID
http://msdn.microsoft.com/en-us/library/ms173825.aspx
SQL Fiddle : http://www.sqlfiddle.com/#!6/d41d8/1832

How do I select TOP 5 PERCENT from each group?

I have a sample table like this:
-- Sample data: one row per occurrence of a Name within a Category.
CREATE TABLE #TEMP
(
    Category VARCHAR(100),
    Name     VARCHAR(100)
);

-- Category A: 6 x John, 4 x Adam, 2 x Lisa, 1 x Bucky
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Lisa');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Lisa');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Bucky');

-- Category B: 5 x Lily, 4 x Tom, 3 x Ross
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');

-- Occurrences per (Category, Name), largest totals first within each category.
SELECT
    Category,
    Name,
    COUNT(Name) AS Total
FROM #TEMP
GROUP BY
    Category,
    Name
ORDER BY
    Category,
    Total DESC;

DROP TABLE #TEMP;
Gives me the following:
A John 6
A Adam 4
A Lisa 2
A Bucky 1
B Lily 5
B Tom 4
B Ross 3
Now, how do I select the TOP 5 PERCENT records from each category assuming each category has more than 100 records (did not show in sample table here)? For instance, in my actual table, it should remove the John record from A and Lily record from B as appropriate (again, I did not show the full table here) to get:
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
I have been trying to use CTEs and PARTITION BY clauses but cannot seem to achieve what I want. It removes the TOP 5 PERCENT from the overall result but not from each category. Any suggestions?
You could use a CTE (Common Table Expression) paired with the NTILE windowing function - this will slice up your data into as many slices as you need, e.g. in your case, into 20 slices (each 5%).
-- Aggregate to one row per (Category, Name), cut each category into 20 tiles by
-- descending total, and return everything outside tile 1.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        NTILE(20) OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) AS [NTile]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT *
FROM SlicedData
WHERE [NTile] > 1
This basically groups your data by Category,Name, orders by something else (not sure if COUNT(Name) is really the thing you want here), and then slices it up into 20 pieces, each representing 5% of your data partition. The slice with NTile = 1 is the top 5% slice - just ignore that when selecting from the CTE.
See:
MSDN docs on NTILE
SQL Server 2005 ranking functions
SQL SERVER – 2005 – Sample Example of RANKING Functions – ROW_NUMBER, RANK, DENSE_RANK, NTILE
for more info
-- For each (Category, name) group: number its rows, carry the group size, and
-- keep the rows numbered up to half the group size (integer division).
SELECT
    Category,
    name,
    CountTotal,
    RankSeq,
    (50 * CountTotal) / 100
FROM
(
    SELECT
        Category,
        name,
        COUNT(*)     OVER (PARTITION BY Category, name)                   AS CountTotal,
        ROW_NUMBER() OVER (PARTITION BY Category, name ORDER BY Category) AS RankSeq
    FROM #TEMP
) AS ranked
WHERE ((50 * CountTotal) / 100) >= RankSeq
ORDER BY
    Category,
    Name,
    RankSeq
Output:
Category name CountTotal RankSeq 50*CountTotal)/100
A Adam 4 1 2
A Adam 4 2 2
A John 6 1 3
A John 6 2 3
A John 6 3 3
A Lisa 2 1 1
B Lily 5 1 2
B Lily 5 2 2
B Ross 3 1 1
B Tom 4 1 2
B Tom 4 2 2
I hope this helps :)
-- Aggregate to one row per (Category, Name) and compute each row's percent rank
-- (0-100) within its category by descending total; return rows below 5.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        -- the stray "**" markdown emphasis markers were removed: they are not SQL
        PERCENT_RANK() OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) * 100 AS [Percent]
    FROM #TEMP
    GROUP BY Category, Name
)
SELECT *
FROM SlicedData
-- PERCENT is a reserved keyword in T-SQL, so the column must be bracket-quoted
WHERE [Percent] < 5
NTile will not work if number of records is less than your tile number.

Order by specific values in a column without using case statement

I would like to get the records in the following order.
If I have records like
A, B, C, D
I would like to get them back in this order:
B, A, C, D, E, F, G, H, and so on.
In other words, the row with the value B must come first...
try this:
-- Rows with YourCol = 'B' first (SortBy = 1), every other row after (SortBy = 2).
SELECT
    *, 1 AS SortBy
FROM YourTable
WHERE YourCol = 'B'
UNION ALL
SELECT
    *, 2 AS SortBy
FROM YourTable
-- NULL compares as UNKNOWN under both = and <>, so without the IS NULL test a
-- row with a NULL YourCol would be missing from both branches.
WHERE YourCol <> 'B' OR YourCol IS NULL
ORDER BY SortBy, YourCol
You don't give any reason to not want to use CASE. I'd still give it a try and see which is faster, the UNION ALL or the CASE method:
-- Sort key 1 for 'B', 2 for anything else; ties are ordered by YourCol.
SELECT *
FROM YourTable
ORDER BY
    CASE YourCol WHEN 'B' THEN 1 ELSE 2 END,
    YourCol
EDIT Working example:
-- Demo fixture.
-- "DECLARE #YourTable table" does not parse (table variables are named @x), so a
-- local temp table is created; it is dropped first for re-runnability.
IF OBJECT_ID('tempdb..#YourTable') IS NOT NULL
    DROP TABLE #YourTable;
CREATE TABLE #YourTable (YourCol char(1), RowValue varchar(5));
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('A','aaa');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('A','aa');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('B','bbb');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('B','bb');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('C','ccc');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('D','ddd');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('E','eee');
INSERT INTO #YourTable (YourCol, RowValue) VALUES ('F','fff');

-- 'B' rows first (SortBy = 1), every other row after (SortBy = 2).
SELECT
    *, 1 AS SortBy
FROM #YourTable
WHERE YourCol = 'B'
UNION ALL
SELECT
    *, 2 AS SortBy
FROM #YourTable
-- YourCol is nullable; NULL fails both = 'B' and <> 'B', so it is admitted here
-- explicitly instead of being dropped from the result.
WHERE YourCol <> 'B' OR YourCol IS NULL
ORDER BY SortBy, YourCol;
OUTPUT:
YourCol RowValue SortBy
------- -------- -----------
B bbb 1
B bb 1
A aaa 2
A aa 2
C ccc 2
D ddd 2
E eee 2
F fff 2
(8 row(s) affected)
SELECT * FROM mytable WHERE mycolumn = 'B'; -- single quotes: "B" is an identifier in ANSI SQL, not a string
followed by
SELECT * FROM mytable WHERE mycolumn <> 'B' ORDER BY mycolumn ASC; -- single quotes: "B" is an identifier in ANSI SQL, not a string
Declare and populate table:
-- Fixture: one row per letter 'A'..'F' (CHAR(65) .. CHAR(70)).
-- "DECLARE #t TABLE" does not parse (table variables are named @x), so a local
-- temp table is created; the queries that read #t can then run afterwards.
IF OBJECT_ID('tempdb..#t') IS NOT NULL
    DROP TABLE #t;
CREATE TABLE #t (col1 CHAR(1));
INSERT INTO #t (col1)
SELECT CHAR(number + 65)
FROM master..spt_values
WHERE type = 'P' AND number < 6;
Query1:
-- Sort key: ASCII code of col1, plus 2 when the code differs from 66 ('B');
-- the cast to bit turns any non-zero difference into 1.
SELECT
    *,
    CAST(ASCII(col1) - 66 AS bit) * 2 + ASCII(col1) AS [orderby]
FROM #t
ORDER BY [orderby]
Query2:
-- Sort on col1 with every 'B' swapped for a space.
SELECT letters.*
FROM #t AS letters
ORDER BY REPLACE(letters.col1, 'B', ' ')
Result for Query1: (the [orderby] column is included for documentation only)
col1 orderby
---- --------
B 66
A 67
C 69
D 70
E 71
F 72