ABC sorting T-SQL - sql

I have a select output with this column, that has about 318 rows
skiped_times
-----------
967
967
244
153
125
116
116
116
116
116
116
98
94
89
73
73
72
66
61
60
60
60
60
60
55
48
...
The sum of this column is about 6,100, and I want another column that sorts these values into 3 groups: for example, group A would contain the first 20% of rows of counted values, group B the next 30%, and group C the rest.
I hope you understand my question

Use window functions:
-- Bucket rows into groups by position: first 20% of rows -> 'A',
-- up to the 50% mark -> 'B', remainder -> 'C'.
select t.*,
       (case when seqnum < num * 0.2 then 'A'   -- fixed: CASE branches use THEN, not AS
             when seqnum < num * 0.5 then 'B'
             else 'C'
        end) as grp
from (select t.*,
             -- rank() keeps tied skiped_times values in the same group
             rank() over (order by skiped_times desc) as seqnum,
             count(*) over () as num             -- total number of rows
      from t
     ) t;
Note: The use of rank() means that tied values are placed in the same group. If tied values may be split between groups, then use row_number() instead.
EDIT:
If you want this by the sum of values -- well, first you should be clear in the question, not in a comment. You can use the same idea:
-- Same idea, but bucketing by each row's running share of the SUM of
-- values rather than by row count.
select t.*,
       (case when running_sum < total * 0.2 then 'A'   -- fixed: THEN, not AS
             when running_sum < total * 0.5 then 'B'
             else 'C'
        end) as grp
from (select t.*,
             -- NOTE(review): the default RANGE frame gives tied skiped_times
             -- values the same running total; add ROWS UNBOUNDED PRECEDING
             -- if ties must accumulate row by row.
             sum(skiped_times) over (order by skiped_times desc) as running_sum,
             sum(skiped_times) over () as total
      from t
     ) t;

Oh my friend.. You gave me a good training for this question.
Here is a solution. I hope this works for you.
-- Bucket rows into A (top 20%), B (next 30%), C (rest) using TOP/OFFSET.
Declare @Percent20 int   -- fixed: T-SQL local variables use @, not # (# denotes temp tables)
Declare @Percent50 int   -- fixed: B's boundary is cumulative 20% + 30% = 50% of rows
set @Percent20 = (select count(skiped_times) from TABLE_NAME) * 0.20
select @Percent20
set @Percent50 = (select count(skiped_times) from TABLE_NAME) * 0.50
select @Percent50
select
ID,
(Case
-- A: value lies within the first 20% (largest values first, per the question)
when exists(select result.skiped_times where result.skiped_times in (select top (@Percent20) skiped_times from TABLE_NAME order by skiped_times desc ))
then 'A'
-- B: within the first 50% (rows already matched as A never reach this branch)
when Exists(select result.skiped_times where result.skiped_times in (select top (@Percent50) skiped_times from TABLE_NAME order by skiped_times desc ))
then 'B'
-- C: everything beyond the 50% boundary
when Exists(select result.skiped_times where result.skiped_times in (select skiped_times from TABLE_NAME order by skiped_times desc offset (@Percent50) rows ))
then 'C'
end
) as Per
from (
select
s.ID, s.skiped_times from TABLE_NAME s   -- fixed: expose ID; the outer select needs it
) as result
order by result.skiped_times desc

Related

SQL count distinct per group divided by count distinct of total

I have:
id
value
1
123
1
124
1
125
2
126
2
127
2
127
3
128
3
128
3
128
I want an aggregation like:
id
distinct_count
total_distinct
percentage
1
3
6
0.5
2
2
6
0.33
3
1
6
0.167
I tried applying a window over clause like this:
-- Attempt from the question: COUNT(DISTINCT ...) OVER () is not implemented
-- as a window function in SQL Server, so this query does not run.
SELECT id,
COUNT(DISTINCT value) AS distinct_count,
COUNT(DISTINCT value) OVER () AS total_distinct,
COUNT(DISTINCT value) / COUNT(DISTINCT value) OVER () AS percentage
FROM have
GROUP BY id
but it seems it is not implemented yet.
is there a way to achieve this without a join?
You can do this:
-- Per-id distinct count, with the overall distinct count fetched by a
-- scalar subquery (no join); 0.0+ promotes the integer division to decimal.
SELECT id,
COUNT(DISTINCT value) AS distinct_count,
(SELECT COUNT(DISTINCT value) FROM have) AS total_distinct,
(0.0+COUNT(DISTINCT value)) / (SELECT COUNT(DISTINCT value) FROM have) AS percentage
FROM have
GROUP BY id
or do:
-- Same result via a CTE: the grand total is computed once and attached to
-- every row with CROSS APPLY; cte.value is selected un-aggregated, so it
-- must also appear in GROUP BY.
-- NOTE(review): the CTE column is named 'value', the same as have.value --
-- confirm the engine resolves COUNT(DISTINCT value) to have.value here.
WITH cte AS (SELECT COUNT(DISTINCT value) AS value FROM have)
SELECT
id,
COUNT(DISTINCT value) AS distinct_count,
cte.value AS total_distinct,
(0.0+COUNT(DISTINCT value)) / cte.value AS percentage
FROM have
CROSS APPLY cte
GROUP By cte.value,id;
An alternative method is to enumerate the values and use conditional aggregation:
-- Conditional aggregation over row-number flags:
--   seqnum_iv = 1 on the first row of each (id, value) pair,
--   seqnum_v  = 1 on the first row of each value overall.
-- Summing the flags per id yields the distinct counts; the 1.0 literals
-- force decimal division for the ratio.
-- NOTE(review): because the outer query groups by id, the seqnum_v flags are
-- split across ids, so total_distinct_count is the number of values FIRST
-- seen within this id, not the global distinct total -- verify against the
-- expected output in the question.
SELECT id,
SUM(CASE WHEN seqnum_iv = 1 THEN 1 ELSE 0 END) as distinct_count,
SUM(CASE WHEN seqnum_v = 1 THEN 1 ELSE 0 END) as total_distinct_count,
(SUM(CASE WHEN seqnum_iv = 1 THEN 1.0 ELSE 0 END) /
SUM(CASE WHEN seqnum_v = 1 THEN 1.0 ELSE 0 END)
) as ratio
FROM (SELECT h.*,
ROW_NUMBER() OVER (PARTITION BY id, value ORDER BY value) as seqnum_iv,
ROW_NUMBER() OVER (PARTITION BY value ORDER BY value) as seqnum_v
FROM have h
) h
GROUP BY id;
This may be faster than an approach using subqueries.

Search sequence pattern in SQL Server

I have records like this, and I want to search in SQL Server for a sequence pattern such as (50, 54, 50) in the Value field; it should return Ids 02, 03, 04. Does anyone have an idea how to do this?
======================================
Id Date Value
01 2020-01-01 50
02 2020-01-02 50
03 2020-01-03 54
04 2020-01-04 50
05 2020-01-05 35
06 2020-01-06 98
07 2020-01-07 13
======================================
There is a request on the user voice site Add support for Row Pattern Recognition in T-SQL (SQL:2016 features R010 and R020) which I believe would allow for this.
In the meantime this should do what you need
-- Window the previous/next row around every row, then keep only the rows
-- whose (previous, current, next) values form the 50, 54, 50 pattern.
WITH Lagged AS
(
    SELECT yt.*,
           LAG(Id)     OVER (ORDER BY Id) AS PrevId,
           LAG(value)  OVER (ORDER BY Id) AS PrevValue,
           LEAD(Id)    OVER (ORDER BY Id) AS NextId,
           LEAD(value) OVER (ORDER BY Id) AS NextValue
    FROM YourTable AS yt
)
SELECT PrevId, Id, NextId
FROM Lagged
WHERE PrevValue = 50
  AND Value = 54
  AND NextValue = 50
If you wanted a more flexible approach, you can use cross apply:
-- For every row, CROSS APPLY builds the comma-separated ids/values of the
-- 3 rows starting at that row's date (TOP (3) ... ORDER BY date), then the
-- outer WHERE keeps windows matching the pattern. The inner and outer 't2'
-- aliases are distinct scopes.
select t2.*
from t cross apply
(select string_agg(id, ',') within group (order by date) as ids,
string_agg(value, ',') within group (order by date) as vals
from (select top (3) t2.*
from t t2
where t2.date >= t.date
order by t2.date
) t2
) t2
where vals = '50,54,50';
Here is a db<>fiddle.
If string_agg() were supported as a window function, you could use:
-- Hypothetical only: string_agg() is not supported as a window function in
-- SQL Server, so this variant does not actually run.
select t.*
from (select t.*,
string_agg(id, ',') within group (order by date) over (order by id rows between current row and 2 following) as ids,
string_agg(value, ',') within group (order by date) over (order by id rows between current row and 2 following) as vals
from t
) t
where vals = '50,54,50';
But alas, it is not.
If I understand your requirement correctly, you can try the logic below, developed with the help of LAG and LEAD:
DEMO HERE
WITH CTE
AS
(
-- Expose a 2-row window either side of each row, so the 50,54,50 pattern
-- can be tested with the current row in any of the three positions.
SELECT Id,Date,
LAG(value,2) OVER(ORDER BY id) lag_2,
LAG(value,1) OVER(ORDER BY id) lag_1,
Value c_val,
LEAD(value,1) OVER(ORDER BY id) lead_1,
LEAD(value,2) OVER(ORDER BY id) lead_2
FROM your_table
)
SELECT Id,Date,
CASE
-- Row is part of a 50,54,50 run: as its last, middle, or first element.
WHEN (lag_2 = 50 AND lag_1 = 54 AND c_val = 50) OR
(lag_1 = 50 AND c_val = 54 AND lead_1 = 50) OR
(c_val = 50 AND lead_1 = 54 AND lead_2 = 50)
THEN (
-- Emit the row's position id within the matched run.
-- NOTE(review): 02/03/04 are integer literals, so leading zeros are lost.
CASE
WHEN lead_1 = 54 THEN 02
WHEN c_val = 54 THEN 03
WHEN lag_1 = 54 THEN 04
END
)
ELSE c_val   -- non-matching rows keep their own value
END
FROM CTE

stratified sample on ranges

I have table_1, that has data such as:
Range Start Range End Frequency
10 20 90
20 30 68
30 40 314
40 40 191 (here, it means we have just 40 as data point repeating 191 times)
table_2:
group value
10 56.1
10 88.3
20 53
20 20
30 55
I need to get the stratified sample on the basis of range from table_1, the table_2 can have millions of rows but the result should be restricted to just 10k points.
Tried below query:
-- Attempt from the question: take each group's share of the overall rows
-- (cpt / ct, with * 1.0 forcing decimal math) and cap the per-group sample
-- at that share of the 10000-row budget.
-- NOTE(review): 'group' and 'value' are reserved words in most engines --
-- confirm they are quoted or named differently in the real schema.
SELECT
d.*
FROM
(
SELECT
ROW_NUMBER() OVER(
PARTITION BY group
ORDER BY group
) AS seqnum,
COUNT(*) OVER() AS ct,
COUNT(*) OVER(PARTITION BY group) AS cpt,
group, value
FROM
table_2 d
) d
WHERE
seqnum < 10000 * ( cpt * 1.0 / ct )
but a bit confused with the analytics functions usage here.
Expecting 10k records as a stratified sample from table_2:
Result table:
group value
10 56.1
20 53
20 20
30 55
This means you need at least one record from each group, plus more records chosen on a random basis; if so, try this:
-- Oracle: keep the first numbered row of every group (RN = 1) plus further
-- rows that pass a DBMS_RANDOM draw, capped at 10000 rows overall.
-- NOTE(review): ORDER BY NULL makes the row numbering nondeterministic,
-- which appears to be the intended source of randomness -- confirm.
SELECT GROUP, VALUE FROM
(SELECT T2.GROUP, T2.VALUE,
ROW_NUMBER()
OVER (PARTITION BY T2.GROUP ORDER BY NULL) AS RN
FROM TABLE_1 T1
JOIN TABLE_2 T2
ON(T1.RANGE = T2.GROUP))
WHERE RN = 1 OR
CASE WHEN RN > 1
AND RN = CEIL(DBMS_RANDOM.VALUE(1,RN))
THEN 1 END = 1
FETCH FIRST 10000 ROWS ONLY;
Here, Rownum is taken on random basis for each group and then result is taking rownum 1 and other rownum if they fulfill random condition.
Cheers!!
If I understand what you want - which is by no means certain - then I think you want to get a maximum of 10000 rows, with the number of group values proportional to the frequencies. So you can get the number of rows you want from each range with:
-- Per-range share of the total frequency, scaled to the 10000-row budget
-- (floor keeps each range's limit a whole number of rows).
select t1.range_start,
       t1.range_end,
       t1.frequency,
       t1.frequency / sum(t1.frequency) over () as proportion,
       floor(10000 * t1.frequency / sum(t1.frequency) over ()) as limit
from table_1 t1;
RANGE_START RANGE_END FREQUENCY PROPORTION LIMIT
----------- ---------- ---------- ---------- ----------
10 20 90 .135746606 1357
20 30 68 .102564103 1025
30 40 314 .473604827 4736
40 40 191 .288084465 2880
Those limits don't quite add up to 10000; you could go slightly above with ceil instead of floor.
You can then assign a nominal row number to each entry in table_2 based on which range it is in, and then restrict the number of rows from that range via that limit:
-- cte1: per-range row budget (floor of that range's share of 10000 rows).
-- cte2: number table_2 rows within their range; the join condition handles
-- both half-open ranges (start < end) and degenerate single-point ranges
-- (start = end, like the 40-40 row).
-- Final select: keep only as many rows per range as the budget allows.
with cte1 (range_start, range_end, limit) as (
select range_start, range_end, floor(10000 * frequency/sum(frequency) over ())
from table_1
),
cte2 (grp, value, limit, rn) as (
select t2.grp, t2.value, cte1.limit,
row_number() over (partition by cte1.range_start order by t2.value) as rn
from cte1
join table_2 t2
on (cte1.range_end > cte1.range_start and t2.grp >= cte1.range_start and t2.grp < cte1.range_end)
or (cte1.range_end = cte1.range_start and t2.grp = cte1.range_start)
)
select grp, value
from cte2
where rn <= limit;
...
9998 rows selected.
I've used order by t2.value in the row_number() call because it isn't clear how you want to pick which rows in the range you actually want; you might want to order by dbms_random.value or something else.
db<>fiddle with some artificial data.

Aggregate within a group of unchanged values

I have sample data:
RowId TypeId Value
1 1 34
2 1 53
3 1 34
4 2 43
5 2 65
6 16 54
7 16 34
8 1 45
9 6 43
10 6 34
11 16 64
12 16 63
I want to count the rows for each type (the Value does not matter to me), but only within consecutive runs of the same TypeId:
TypeId Count
1 3
2 2
16 2
1 1
6 2
16 2
How to achieve this result?
This should give you COUNT of rows within a group of unchanged values:
-- Gaps-and-islands: 'gap' is 1 whenever TypeId differs from the previous
-- row, and its running SUM assigns one 'grp' number per unbroken run;
-- counting per (TypeId, grp) then yields the run lengths.
SELECT TypeId, grp, COUNT(*)
FROM (
    SELECT marked.RowId, marked.TypeId, marked.Value, marked.gap,
           SUM(gap) OVER (ORDER BY RowId) AS grp
    FROM (
        SELECT RowId, TypeId, Value,
               CASE WHEN TypeId = LAG(TypeId) OVER (ORDER BY RowId)
                    THEN 0
                    ELSE 1
               END AS gap
        FROM dummy
    ) marked
) grouped
GROUP BY TypeId, grp;
If you prefer WITH over endless sub-query inclusions:
-- Same gaps-and-islands approach as above, but with the grouped rows named
-- in a CTE instead of nested sub-queries; the run id 'grp' only appears in
-- GROUP BY, keeping the output to TypeId and its run length.
WITH dummy_with_groups AS (
SELECT RowId, TypeId , Value, SUM(gap) OVER (ORDER BY RowId) grp
FROM (SELECT RowId, TypeId , Value,
CASE WHEN TypeId = lag(TypeId) OVER (ORDER BY RowId)
THEN 0 ELSE 1 END gap
FROM dummy) t
)
SELECT TypeId, COUNT(*) as Result
FROM dummy_with_groups
GROUP BY TypeId, grp;
http://www.sqlfiddle.com/#!6/f16e9/34
Check this fiddle demo. I have renamed your columns a little.
-- myCTE: cnt = running per-type row count; show = 1 on the LAST row of each
-- unbroken run of type_id (the next row's type differs, or there is none).
-- innerQuery: one row per run end, carrying that cumulative per-type count.
-- Final query: run length = cumulative count at this run's end minus the
-- count at the same type's previous run end (located via MAX(row_id) in the
-- correlated EXISTS), or minus 0 (ISNULL) for the type's first run.
WITH myCTE AS
(SELECT row_id,
type_id,
ROW_NUMBER () OVER (PARTITION BY type_id ORDER BY row_id)
AS cnt,
CASE LEAD (type_id) OVER (ORDER BY row_id)
WHEN type_id THEN 0
ELSE 1
END
AS show
FROM dummy),
innerQuery AS
(SELECT row_id, type_id, cnt
FROM myCTE
WHERE show = 1)
SELECT iq1.type_id, iq1.cnt - ISNULL (iq2.cnt, 0) CNT
FROM innerQuery iq1
LEFT OUTER JOIN innerQuery iq2
ON iq1.type_id = iq2.type_id
AND EXISTS
(SELECT 1
FROM innerQuery iq3
WHERE iq3.type_id = iq1.type_id
AND iq3.row_id < iq1.row_id
HAVING MAX (iq3.row_id) = iq2.row_id)
The output is exactly as expected.

Accumulate a summarized column

I could need some help with a SQL statement. So I have the table "cont" which looks like that:
cont_id name weight
----------- ---------- -----------
1 1 10
2 1 20
3 2 40
4 2 15
5 2 20
6 3 15
7 3 40
8 4 60
9 5 10
10 6 5
I then summed up the weight column and grouped it by the name:
name wsum
---------- -----------
2 75
4 60
3 55
1 30
5 10
6 5
And the result should have a accumulated column and should look like that:
name wsum acc_wsum
---------- ----------- ------------
2 75 75
4 60 135
3 55 190
1 30 220
5 10 230
6 5 235
But I didn't manage to get the last statement working..
edit: this Statement did it (thanks Gordon)
-- Accepted solution: for each summarized (name, wsum) row, the correlated
-- subquery sums every wsum ranked at or before it (larger wsum first, with
-- name as the tie-breaker), producing the running total without window
-- functions.
select t.*,
(select sum(wsum) from (select name, SUM(weight) wsum
from cont
group by name)
t2 where t2.wsum > t.wsum or (t2.wsum = t.wsum and t2.name <= t.name)) as acc_wsum
from (select name, SUM(weight) wsum
from cont
group by name) t
order by wsum desc
So, the best way to do this is using cumulative sum:
-- Cumulative sum: an ORDER BY inside OVER turns SUM into a running total.
-- NOTE(review): the default RANGE frame treats tied wsum values as peers
-- and gives them the same running total; add ROWS UNBOUNDED PRECEDING if
-- ties must accumulate row by row.
select t.*,
sum(wsum) over (order by wsum desc) as acc_wsum
from (<your summarized query>) t
If you don't have that capability (in SQL Server 2012 and Oracle), a correlated subquery is an easy way to do it, assuming the summed weights are distinct values:
-- Portable fallback: sum every wsum greater than or equal to the current
-- row's; correct only while the summed weights are distinct, as stated.
select t.*,
(select sum(wsum) from (<your summarized query>) t2 where t2.wsum >= t.wsum) as acc_wsum
from (<your summarized query>) t
This should work in all dialects of SQL. To work with situations where the accumulated weights might have duplicates:
-- Tie-safe variant: name breaks ties so duplicate wsum values still get
-- distinct running totals.
select t.*,
       (select sum(wsum)
        from (<your summarized query>) t2
        where t2.wsum > t.wsum
           or (t2.wsum = t.wsum and t2.name <= t.name)
       ) as acc_wsum   -- fixed: the subquery was missing its closing parenthesis
from (<your summarized query>) t
try this
-- Number the summarized rows, then sum everything ranked at or before each
-- row via a correlated subquery.
;WITH CTE
AS
(
SELECT *,
-- fixed: DESC so accumulation starts at the largest wsum (75 -> 135 -> ...),
-- matching the desired output above
ROW_NUMBER() OVER(ORDER BY wsum DESC) rownum
FROM #table1
)
SELECT
c1.name,
c1.wsum,
-- running total = sum of all rows ranked at or before this one
acc_wsum= (SELECT SUM(c2.wsum)
FROM cte c2
WHERE c2.rownum <= c1.rownum)
FROM CTE c1;
or you can join instead of using subquery
-- Same accumulation, but with a triangular self-join instead of a subquery.
;WITH CTE
AS
(
SELECT *,
-- fixed: order by wsum DESC ('usercount' does not exist in this table and
-- ascending order contradicts the desired 75 -> 135 -> ... accumulation)
ROW_NUMBER() OVER(ORDER BY wsum DESC) rownum
FROM #table1
)
SELECT
c1.name,
c1.wsum,
acc_wsum= SUM(c2.wsum)
FROM CTE c1
-- join each row to every row ranked at or before it, then sum per row
INNER JOIN CTE c2 ON c2.rownum <= c1.rownum
GROUP BY c1.name, c1.wsum;