How do I select TOP 5 PERCENT from each group? - sql

I have a sample table like this:
-- Sample data: one row per occurrence of a Name within a Category.
CREATE TABLE #TEMP
(
    Category VARCHAR(100),
    Name     VARCHAR(100)
);

INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'John');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Adam');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Lisa');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Lisa');
INSERT INTO #TEMP (Category, Name) VALUES ('A', 'Bucky');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Lily');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Tom');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');
INSERT INTO #TEMP (Category, Name) VALUES ('B', 'Ross');

-- Occurrences of each Name per Category, most frequent first within a Category.
SELECT
    Category,
    Name,
    COUNT(Name) AS Total
FROM #TEMP
GROUP BY
    Category,
    Name
ORDER BY
    Category,
    Total DESC;

DROP TABLE #TEMP;
Gives me the following:
A John 6
A Adam 4
A Lisa 2
A Bucky 1
B Lily 5
B Tom 4
B Ross 3
Now, how do I select the TOP 5 PERCENT records from each category assuming each category has more than 100 records (did not show in sample table here)? For instance, in my actual table, it should remove the John record from A and Lily record from B as appropriate (again, I did not show the full table here) to get:
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
I have been trying to use CTEs and PARTITION BY clauses but cannot seem to achieve what I want. It removes the TOP 5 PERCENT from the overall result but not from each category. Any suggestions?

You could use a CTE (Common Table Expression) paired with the NTILE windowing function - this will slice up your data into as many slices as you need, e.g. in your case, into 20 slices (each 5%).
-- Bucket each Category's (Category, Name) groups into 20 tiles, ordered by
-- occurrence count descending, then return everything except tile 1.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        NTILE(20) OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) AS [NTile]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    [NTile]
FROM SlicedData
WHERE [NTile] > 1
This basically groups your data by Category,Name, orders by something else (not sure if COUNT(Name) is really the thing you want here), and then slices it up into 20 pieces, each representing 5% of your data partition. The slice with NTile = 1 is the top 5% slice - just ignore that when selecting from the CTE.
See:
MSDN docs on NTILE
SQL Server 2005 ranking functions
SQL SERVER – 2005 – Sample Example of RANKING Functions – ROW_NUMBER, RANK, DENSE_RANK, NTILE
for more info

-- Number the rows inside every (Category, name) group and keep the first
-- (50 * group size) / 100 of them (integer arithmetic).
;WITH Ranked AS
(
    SELECT
        Category,
        name,
        COUNT(*)     OVER (PARTITION BY Category, name)                   AS CountTotal,
        ROW_NUMBER() OVER (PARTITION BY Category, name ORDER BY Category) AS RankSeq
    FROM #TEMP
)
SELECT
    Category,
    name,
    CountTotal,
    RankSeq,
    (50 * CountTotal) / 100
FROM Ranked
WHERE RankSeq <= (50 * CountTotal) / 100
ORDER BY
    Category,
    Name,
    RankSeq
Output:
Category name CountTotal RankSeq (50*CountTotal)/100
A Adam 4 1 2
A Adam 4 2 2
A John 6 1 3
A John 6 2 3
A John 6 3 3
A Lisa 2 1 1
B Lily 5 1 2
B Lily 5 2 2
B Ross 3 1 1
B Tom 4 1 2
B Tom 4 2 2
I hope this helps :)

-- Rank each (Category, Name) group inside its Category by occurrence count
-- (highest first) and keep only the groups whose percent rank is below 5.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        -- PERCENT_RANK() yields a fraction between 0 and 1; scale it to 0-100.
        PERCENT_RANK() OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) * 100 AS [Percent]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    [Percent]
FROM SlicedData
-- PERCENT is a reserved T-SQL keyword, so the alias has to be delimited here.
WHERE [Percent] < 5
NTile will not work if number of records is less than your tile number.

Related

Sequential numbering of data sequence

I do have the following table:
-- Sample data: obs alternates between runs of 1 and 0 as id increases.
create table test_seq
(
    id  int,
    obs int
);

insert into test_seq (id, obs)
values
    (1, 1),
    (2, 1),
    (3, 1),
    (4, 0),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
    (10, 1),
    (11, 0);
Is there an SQL way to create the following output?
id obs seq_num
1 1 1
2 1 1
3 1 1
4 0 2
5 0 2
6 1 3
7 1 3
8 0 4
9 0 4
10 1 5
11 0 6
seq_num is increased by 1 every time when value in column obs is changed compared to the previous row (ordered by id). I can solve this easily in Excel (using simple if formula), but can't figure out this in postgres.
using analytic functions, something like :
-- Number consecutive runs of equal obs values (ordered by id), starting at 1.
select
    id,
    obs,
    -- Running count of value changes so far. The first row has no predecessor
    -- (lag() is NULL, so its flag is 0), hence the +1 to start numbering at 1.
    sum(changed) over (order by id) + 1 as seq_num
from (
    select
        id,
        obs,
        case when obs <> lag(obs) over (order by id) then 1 else 0 end as changed
    from test_seq
) as flagged  -- PostgreSQL before version 16 requires an alias on a FROM subquery
order by id;
I've figured it out:
-- Flag every row whose obs differs from the previous row's obs (ordered by id),
-- then turn the running total of those flags into a 1-based sequence number.
with flagged as (
    select
        id,
        obs,
        -- first row: lag() is NULL, the comparison is NULL, so the flag is 0
        case when lag(obs, 1) over (order by id) <> obs then 1 else 0 end as is_change
    from test_seq  -- same table name as in the sample DDL (no schema prefix)
)
select
    id,
    obs,
    sum(is_change) over (order by id rows between unbounded preceding and current row) + 1 as seq_num
from flagged
order by id;
You can use rownumber to do that.

Assigning a Row Number in SQL Server, but grouped on a value

I want to select 2 columns from a table, and assign an int value to each value. However, I want the 1st column ID to be the same for all values that are the same.
For the 2nd column, I want each value to numbered as well, but partitioned by the first column. I have figured this piece out, but I can't get the first part to work.
Here is the test scenario I'm using.
-- Sample rows in a table variable. T-SQL table variables use the @ sigil;
-- "DECLARE #TestTable as Table" does not parse.
DECLARE @TestTable AS TABLE (Column1 char(1), Column2 char(1));

INSERT INTO @TestTable (Column1, Column2) SELECT 'A', 'A';
INSERT INTO @TestTable (Column1, Column2) SELECT 'A', 'B';
INSERT INTO @TestTable (Column1, Column2) SELECT 'A', 'C';
INSERT INTO @TestTable (Column1, Column2) SELECT 'B', 'D';
INSERT INTO @TestTable (Column1, Column2) SELECT 'B', 'E';
INSERT INTO @TestTable (Column1, Column2) SELECT 'B', 'F';
INSERT INTO @TestTable (Column1, Column2) SELECT 'B', 'G';
INSERT INTO @TestTable (Column1, Column2) SELECT 'B', 'H';
INSERT INTO @TestTable (Column1, Column2) SELECT 'C', 'A';
INSERT INTO @TestTable (Column1, Column2) SELECT 'C', 'B';
INSERT INTO @TestTable (Column1, Column2) SELECT 'C', 'C';

-- The query as asked about: Column2_ID numbers the rows inside each Column1
-- partition; Column1_ID is also partitioned by Column1, so it restarts at 1
-- for every Column1 value instead of giving one ID per distinct value.
SELECT
    ROW_NUMBER() OVER (PARTITION BY Column1 ORDER BY Column1) AS Column1_ID,
    Column1,
    ROW_NUMBER() OVER (PARTITION BY Column1 ORDER BY Column1, Column2) AS Column2_ID,
    Column2
FROM @TestTable;
When I run this, the values in Column2_ID are correct, but I would like the values for Column1_ID to be as follows.
Column1_ID Column1 Column2_ID Column2
1 A 1 A
1 A 2 B
1 A 3 C
2 B 1 D
2 B 2 E
2 B 3 F
2 B 4 G
2 B 5 H
3 C 1 A
3 C 2 B
3 C 3 C
You just need to use a different ranking function,
-- Select-list fragment: no PARTITION BY, so equal Column1 values share one rank and ranks have no gaps.
dense_rank() OVER (ORDER BY Column1) as Column1_ID
http://msdn.microsoft.com/en-us/library/ms173825.aspx
SQL Fiddle : http://www.sqlfiddle.com/#!6/d41d8/1832

find start and end dates over a non-contiguous range

I need to find the start and end dates of range defined as: start date is the first date and the end date is the first date where the subsequent date is two months or more after the end date. There can be multiple possible ranges
I have a table structure like:
ID int identity(1,1),
fk_ID char(9),
dateField datetime
The data looks like:
1 a 2012-01-01
2 a 2012-01-05
3 a 2012-01-12
4 b 2012-02-01
5 a 2012-04-01
6 b 2012-05-01
7 a 2012-05-30
The expected output would look like:
fk_id startdate enddate
a 2012-01-01 2012-01-12
a 2012-04-01 2012-05-30
b 2012-02-01 2012-02-01
b 2012-05-01 null
EDIT:
By doing the following:
-- Scratch table: one row per (id, sd) date observation; autonum is an identity column.
CREATE TABLE #temp
(
autonum int identity(1,1),
id char(9),
sd datetime
)
insert into #temp (id, sd) values ('a', '2012-01-01')
insert into #temp (id, sd) values ('a', '2012-01-05')
insert into #temp (id, sd) values ('a', '2012-01-12')
insert into #temp (id, sd) values ('a', '2012-03-01')
insert into #temp (id, sd) values ('a', '2012-04-03')
insert into #temp (id, sd) values ('a', '2012-06-06')
insert into #temp (id, sd) values ('b', '2012-02-12')
insert into #temp (id, sd) values ('b', '2012-02-15')
insert into #temp (id, sd) values ('b', '2012-03-01')
insert into #temp (id, sd) values ('b', '2012-04-03')
insert into #temp (id, sd) values ('b', '2012-06-01')
-- Branch 1: per id, the earliest date as the first range start (no previous end).
select t1.id, null as previousend, min(t1.sd) as nextstart
from #temp t1
group by t1.id
-- UNION (not UNION ALL): duplicate rows across the branches are removed.
union
-- Branch 2: every date whose next date for the same id is at least two months
-- later; that date closes a range and the next date opens the following one.
select t1.id, t1.sd as enddate, (select min(t2.sd) from #temp t2 where t1.id=t2.id and t2.sd>t1.sd) as nextstart
from #temp t1
where (select min(t2.sd) from #temp t2 where t1.id=t2.id and t2.sd>t1.sd) >= dateadd(month, 2, t1.sd)
union
-- Branch 3: per id, the latest date as the final range end (no next start).
select t1.id, max(t1.sd), null
from #temp t1
group by t1.id
drop table #temp
I can get output like this:
id previousend nextstart
--------- ----------------------- -----------------------
a NULL 2012-01-01 00:00:00.000
a 2012-04-03 00:00:00.000 2012-06-06 00:00:00.000
a 2012-06-06 00:00:00.000 NULL
b NULL 2012-02-12 00:00:00.000
b 2012-06-01 00:00:00.000 NULL
Which is very close, but ideally the start and end date of the range would be on the row.
Here is my best guess given all the changes to the question. I still find the problem very confusing, splintered and that the desired results for the two cases don't seem to match. With this query:
-- Collapse each id's dates into (sd, ed) ranges using a two-month look-ahead window.
;WITH x AS
(
-- Pair every row a with each row b of the same id whose date lies in
-- [a.sd, a.sd + 2 months]; rn1 numbers the pairs within each (id, sd).
SELECT a.id, sd = a.sd, ed = b.sd, rn1 = ROW_NUMBER() OVER
(PARTITION BY a.id, a.sd ORDER BY a.sd)
FROM #temp AS a
LEFT OUTER JOIN #temp AS b
ON a.id = b.id
AND b.sd >= a.sd
AND b.sd <= DATEADD(MONTH, 2, a.sd)
),
y AS
-- One row per (id, sd) (rn1 = 1); ed is the largest paired end date among all
-- x rows of the same id whose start is no later than this start + 2 months.
(SELECT id, sd,
ed = (SELECT MAX(ed) FROM x AS x2
WHERE x.id = x2.id AND x2.sd <= DATEADD(MONTH, 2, x.sd)
)
FROM x
WHERE rn1 = 1
),
z AS
(
-- Rows sharing the same (id, ed) collapse to one range starting at the earliest sd.
SELECT id, sd = MIN(sd), ed
FROM y GROUP BY id, ed
)
-- The CASE inside the block comment below is disabled on purpose; when enabled it
-- keeps ed only if ed > sd, or if sd = ed and the id has no earlier range, else NULL.
SELECT id, sd, ed /* = CASE
WHEN ed > sd OR (sd = ed AND NOT EXISTS
(SELECT 1 FROM z AS z2 WHERE z2.id = z.id AND z.sd > z2.sd)) THEN ed END
*/
FROM z
ORDER BY id, sd;
The results for your first set of data:
-- First sample data set: the same (id, date) rows as the question's original data listing.
INSERT #temp (id, sd) VALUES
('a','2012-01-01'),
('a','2012-01-05'),
('a','2012-01-12'),
('b','2012-02-01'),
('a','2012-04-01'),
('b','2012-05-01'),
('a','2012-05-30');
Is as follows:
id sd ed
a 2012-01-01 2012-01-12
a 2012-04-01 2012-05-30
b 2012-02-01 2012-02-01
b 2012-05-01 2012-05-01
And for the second set:
-- Second sample data set: the same rows as the inserts in the question's EDIT.
insert into #temp (id, sd) values ('a', '2012-01-01')
insert into #temp (id, sd) values ('a', '2012-01-05')
insert into #temp (id, sd) values ('a', '2012-01-12')
insert into #temp (id, sd) values ('a', '2012-03-01')
insert into #temp (id, sd) values ('a', '2012-04-03')
insert into #temp (id, sd) values ('a', '2012-06-06')
insert into #temp (id, sd) values ('b', '2012-02-12')
insert into #temp (id, sd) values ('b', '2012-02-15')
insert into #temp (id, sd) values ('b', '2012-03-01')
insert into #temp (id, sd) values ('b', '2012-04-03')
insert into #temp (id, sd) values ('b', '2012-06-01')
Is as follows:
id sd ed
a 2012-01-01 2012-04-03
a 2012-06-06 2012-06-06
b 2012-02-12 2012-06-01
If you uncomment the CASE block you'll get NULLs for the end date where the start date and end date are the same. As I suggested multiple times, your question is splintered and your desired results don't seem to match, so I'm not sure what the right answer is.
attempt number two which is on Fiddle and is far from elegant but seems to work apart from the final record not being NULL for the end date:
-- Sample data: one row per (id, date) observation.
CREATE TABLE temp
(
    id char(9),
    d  datetime
);

INSERT INTO temp (id, d) VALUES ('a', '2012-01-01');
INSERT INTO temp (id, d) VALUES ('a', '2012-01-05');
INSERT INTO temp (id, d) VALUES ('a', '2012-01-12');
INSERT INTO temp (id, d) VALUES ('a', '2012-04-01');
INSERT INTO temp (id, d) VALUES ('a', '2012-05-30');
INSERT INTO temp (id, d) VALUES ('b', '2012-02-01');
INSERT INTO temp (id, d) VALUES ('b', '2012-05-01');

-- date_pairs: every distinct (start, end) pair of dates for the same id with start <= end.
WITH date_pairs AS
(
    SELECT
        j.id,
        j.d AS sd,
        q.d AS ed
    FROM temp AS j
    INNER JOIN temp AS q
        ON  j.id = q.id
        AND j.d <= q.d
    GROUP BY
        j.id,
        j.d,
        q.d
),
-- latest_end: per start date, the latest end date with DATEDIFF(m, ...) of at most 2.
latest_end AS
(
    SELECT
        id,
        sd,
        MAX(ed) AS ed
    FROM date_pairs
    WHERE DATEDIFF(m, sd, ed) <= 2
    GROUP BY
        id,
        sd
)
-- Starts that share an end date collapse into one range beginning at the earliest start.
SELECT
    id,
    MIN(sd) AS sd,
    ed
FROM latest_end
GROUP BY
    id,
    ed
ORDER BY
    id,
    MIN(sd),
    ed

Select 30% of each column value

Let's assume we have a table with a column 'A' that has values from 0 to N. And I want to select 30% of the rows within each group of rows that have the same value for the column 'A'.
So if I have this:
A| B
-------
0 hello
0 test
0 hi
1 blah1
1 blah2
1 blah3
1 blah4
1 blah5
1 blah6
Result:
A| B
-------
0 hello
1 blah1
1 blah4
it could be blah1 or any other blah that is not blah4, and blah4 can be any other blah that is not blah1, basically it could be random or skipping.
By the way, the actual table is huge, talking terabytes, so think about performance.
try something like this:
-- Sample data (the OP's rows). "DECLARE #YourTable table" is not valid T-SQL, so
-- a real temp table is created instead; that also lets the follow-up queries in
-- this thread, which select from #YourTable, run in later batches of the session.
IF OBJECT_ID('tempdb..#YourTable') IS NOT NULL
    DROP TABLE #YourTable;

CREATE TABLE #YourTable (A int, b varchar(10));

INSERT INTO #YourTable (A, b) VALUES (0, 'hello');
INSERT INTO #YourTable (A, b) VALUES (0, 'test');
INSERT INTO #YourTable (A, b) VALUES (0, 'hi');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah1');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah2');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah3');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah4');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah5');
INSERT INTO #YourTable (A, b) VALUES (1, 'blah6');

-- NumberedRows: position of each row inside its A group, ordered by B.
;WITH NumberedRows AS
(
    SELECT
        A,
        B,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY A, B) AS RowNumber
    FROM #YourTable
)
-- GroupCounts: the highest row number per A, i.e. the size of that group.
, GroupCounts AS
(
    SELECT
        A,
        MAX(RowNumber) AS MaxA
    FROM NumberedRows
    GROUP BY A
)
-- Keep the rows whose position is within 30% of (group size + 1).
SELECT
    n.a,
    n.b
FROM NumberedRows AS n
INNER JOIN GroupCounts AS c
    ON n.A = c.A
WHERE n.RowNumber <= (c.MaxA + 1) * 0.3
OUTPUT:
a b
----------- ----------
0 hello
1 blah1
1 blah2
(3 row(s) affected)
EDIT based on the great idea in the comment from Andriy M
-- Same 30% filter, with the group size taken from a windowed COUNT(*) so that
-- no second aggregation step and join are needed.
SELECT
    n.a,
    n.b
FROM
(
    SELECT
        A,
        B,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY A, B) AS RowNumber,
        COUNT(*)     OVER (PARTITION BY A)               AS TotalOf
    FROM #YourTable
) AS n
WHERE n.RowNumber <= (n.TotalOf + 1) * 0.3
ORDER BY A
OUTPUT:
a b
----------- ----------
0 hello
1 blah1
1 blah2
(3 row(s) affected)
EDIT here are "random" rows, using Andriy M idea:
-- Sample data (the OP's rows) in a table variable. Table variables take the @
-- sigil; "DECLARE #YourTable table" does not parse.
DECLARE @YourTable table (A int, b varchar(10));

INSERT @YourTable (A, b) VALUES (0, 'hello');
INSERT @YourTable (A, b) VALUES (0, 'test');
INSERT @YourTable (A, b) VALUES (0, 'hi');
INSERT @YourTable (A, b) VALUES (1, 'blah1');
INSERT @YourTable (A, b) VALUES (1, 'blah2');
INSERT @YourTable (A, b) VALUES (1, 'blah3');
INSERT @YourTable (A, b) VALUES (1, 'blah4');
INSERT @YourTable (A, b) VALUES (1, 'blah5');
INSERT @YourTable (A, b) VALUES (1, 'blah6');

-- NumberedRows: rows numbered inside each A group in NEWID() order, so which
-- rows get the low numbers differs from run to run.
;WITH NumberedRows AS
(
    SELECT
        A,
        B,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY NEWID()) AS RowNumber
    FROM @YourTable
)
-- GroupCounts: number of rows per A.
, GroupCounts AS
(
    SELECT
        A,
        COUNT(A) AS MaxA
    FROM NumberedRows
    GROUP BY A
)
-- Keep the rows whose position is within 30% of (group size + 1).
SELECT
    n.A,
    n.B
FROM NumberedRows AS n
INNER JOIN GroupCounts AS c
    ON n.A = c.A
WHERE n.RowNumber <= (c.MaxA + 1) * 0.3
ORDER BY n.A
OUTPUT:
a b
----------- ----------
0 hi
1 blah3
1 blah6
(3 row(s) affected)
This uses only one subquery, and thus a single pass through your set.
-- Rank rows by b inside each A group and keep those ranked within 30% of the
-- group's count of non-NULL b values.
;WITH ranked AS
(
    SELECT
        A,
        b,
        ROW_NUMBER() OVER (PARTITION BY A ORDER BY b) AS r,
        COUNT(b)     OVER (PARTITION BY A)            AS ct
    FROM #YourTable
)
SELECT
    a,
    b
FROM ranked
WHERE r <= ct * 0.3
As does this, although this always returns the top 3 if there are fewer than 10 and "extras" get posted to the first bins:
-- Split each a group into 10 tiles ordered by b and keep the first three tiles.
;WITH tiled AS
(
    SELECT
        A,
        b,
        NTILE(10) OVER (PARTITION BY a ORDER BY b) AS tens
    FROM #YourTable
)
SELECT
    A,
    b
FROM tiled
WHERE tens <= 3;

How can I group a set split by change in a field with respect to an order?

I have a set of records.
ID Value
1 a
2 b
3 b
4 b
5 a
6 a
7 b
8 b
And I would like to group them like so.
MIN(ID) MAX(ID) Value
1 1 a
2 4 b
5 6 a
7 8 b
I'm vaguely aware of oracle over() analytical function which looks to be the right direction, but I don't know what this problem is called much less how to solve it.
Probably an easier way, but this may help to start. I ran it on Postgres, but should work (maybe with a minor tweak) on Oracle. The inner most query puts the previous value on each row. We can use that to detect a grouping change (when value does not equal previous value). Every time there is a group change, we flag it with a "1". Sum these group changes and we now have a group id which increments every time there is a value change. Then we can perform our normal group by function.
-- Sample data: value changes at ids 2, 5 and 7.
create table x
(
    id    int,
    value varchar(1)
);

insert into x (id, value)
values
    (1, 'a'),
    (2, 'b'),
    (3, 'b'),
    (4, 'b'),
    (5, 'a'),
    (6, 'a'),
    (7, 'b'),
    (8, 'b');

-- with_previous: each row carries the preceding row's value (its own value on the first row).
with with_previous as (
    select
        id,
        value,
        coalesce(lag(value) over (order by id), value) as previous_value
    from x
),
-- with_group: running count of value changes, used as an id for each consecutive run.
with_group as (
    select
        id,
        value,
        sum(case when value = previous_value then 0 else 1 end) over (order by id) as group_id
    from with_previous
)
select
    min(id),
    max(id),
    value
from with_group
group by
    group_id,
    value
order by
    min(id),
    max(id);
min | max | value
-----+-----+-------
1 | 1 | a
2 | 4 | b
5 | 6 | a
7 | 8 | b
(4 rows)