Spark SQL Calculating wrong dense rank on large data sets - sql

I am calculating a dense rank over a table of around 1M rows. The problem is that the ranks in the resulting data set are not calculated correctly.
Why is it so?
The maximum element in the dataset is not assigned the 100th percentile (i.e. the highest rank).
As far as I know, Spark SQL works on chunks/partitions of the data to speed up processing.
Do you think this might be the cause?
Rank function: dense_rank
Column A    Expected Rank A    Wrong Rank A
99          6                  2
44          5                  1
31          4                  5
25          3                  4
21          2                  3
21          2                  2
10          1                  1
-- Traffic percentile per company, where pageViewsCount is ranked only against
-- companies in the same TotalEmployees size band.
--
-- Replaces ten copy-pasted UNION branches (each with its own scalar subquery
-- over pv) with a single pass:
--   * size_band reproduces the original band limits exactly; rows that fall
--     in no band (NULL, < 1, or between two limits) are dropped, as before.
--   * dense_rank() is partitioned by size_band, so every band is ranked
--     independently instead of relying on a separately filtered branch.
--   * dist_cnt is still count(distinct pageViewsCount) within the band.
--   * SELECT DISTINCT keeps the duplicate elimination the original UNION
--     performed; it can be removed if pv is known to have no duplicate rows.
--   * ORDER BY is applied to the outermost query, because an ORDER BY inside
--     a derived table does not guarantee the order of the final result.
with banded as (
    select
        CompanyId,
        pageViewsCount,
        TotalEmployees,
        case
            when TotalEmployees between 1 and 10 then 1
            when TotalEmployees between 11 and 50 then 2
            when TotalEmployees between 51 and 200 then 3
            when TotalEmployees between 201 and 500 then 4
            when TotalEmployees between 501 and 1000 then 5
            when TotalEmployees between 1001 and 5000 then 6
            when TotalEmployees between 5001 and 10000 then 7
            when TotalEmployees between 10001 and 50000 then 8
            when TotalEmployees between 50001 and 100000 then 9
            when TotalEmployees >= 100001 then 10
        end as size_band
    from pv
),
-- Number of distinct page-view counts per band (the percentile denominator).
band_stats as (
    select
        size_band,
        count(distinct pageViewsCount) as dist_cnt
    from banded
    where size_band is not null
    group by size_band
),
-- Rank of each company inside its own band; the inner join also discards
-- rows whose size_band is NULL.
ranked as (
    select distinct
        b.CompanyId,
        b.pageViewsCount,
        b.TotalEmployees,
        dense_rank() over (partition by b.size_band order by b.pageViewsCount) as pv_rank,
        s.dist_cnt
    from banded b
    inner join band_stats s
        on s.size_band = b.size_band
)
select
    named_struct('JobName','Traffic') AS headers,
    named_struct('data',named_struct('companyId',CompanyId,'trafficPercentile',(pv_rank/dist_cnt) * 100,'pageViewsCount',pageViewsCount,'category','Traffic')) as payload
from ranked
order by TotalEmployees
In this query I need to calculate the percentile separately for each block of data, where the blocks are defined by TotalEmployees.
Output count : 1M

Related

sum for each customers and create a summary table

I have Table A :
Customer_ID Card_number Amount_of_deals
1 221 100
1 222 350
2 223 200
3 334 700
3 344 650
4 544 1500
I want to create a new table with ranges of the amount and count of customers in each range.
the new table should be :
Range Number_of_customers
0-500 2
500-1000 0
1000-1500 2
How can I create this table?
Thanks in advance
You can use case expression with group by :
-- Number of customers per range of total deal amount.
-- The amounts are summed per customer first, so a customer with several cards
-- is bucketed once, by the total of all of that customer's deals.
-- Column and table names follow Table A from the question
-- (Customer_ID, Amount_of_deals).
select per_customer.amount_range as Range,
       count(*) as Number_of_customers
from (
    select customer_id,
           (case when sum(amount_of_deals) >= 0 and sum(amount_of_deals) <= 500 then '0-500'
                 when sum(amount_of_deals) > 500 and sum(amount_of_deals) <= 1000 then '501-1000'
                 when sum(amount_of_deals) > 1000 and sum(amount_of_deals) <= 1500 then '1001-1500'
                 else '1500+'
            end) as amount_range
    from a
    group by customer_id
) per_customer
group by per_customer.amount_range;
If you are using SQL Server then you can use apply & use SELECT .. INTO statement :
-- SQL Server variant: materialise the per-range customer counts in new_table.
-- Deal amounts are summed per customer before bucketing, so each customer is
-- counted exactly once, in the range that contains that customer's total.
select tt.range, count(*) as Number_of_customers
into new_table
from (
    select customer_id, sum(amount_of_deals) as total_amount
    from a
    group by customer_id
) c cross apply
( values (case when c.total_amount >= 0 and c.total_amount <= 500 then '0-500'
               when c.total_amount > 500 and c.total_amount <= 1000 then '501-1000'
               when c.total_amount > 1000 and c.total_amount <= 1500 then '1001-1500'
               else '1500+'
          end)
) tt(range)
group by tt.range;
Because you want 0 values, you need a left join somewhere. I would recommend:
-- Total deal amount per customer.
with customer_total as (
    select customer_id,
           sum(amount_of_deals) as num
    from a
    group by customer_id
)
-- Every range is listed, including ranges no customer falls into: the left
-- join keeps the range row and count() then counts zero customer ids.
select band.range, count(ct.customer_id)
from (values ('0-500', 0, 500),
             ('501-1000', 500, 1000),
             ('1001-1500', 1000, 1500)
     ) as band(range, lo, hi)
left join customer_total as ct
    on ct.num > band.lo
   and ct.num <= band.hi
group by band.range
order by min(band.lo);
Here is a db<>fiddle.

SQL: selecting rows where column value changed last time

I need to get the latest date on which the number changed. I have this SQL statement:
-- Rows on or after 2018-06-01 whose number differs from the number of the
-- closest earlier row (same group, same date window, number > 0).
-- [group] and [date] are bracketed because they are reserved words.
SELECT
    a.[group],
    a.[date],
    a.number
FROM xx.dbo.list AS a
WHERE a.[group] IN ('10', '10NC', '210')
  AND a.[date] >= '2018-06-01'
  AND a.number > 0
  -- Correlated lookup of the previous row's number for the same group.
  AND a.number <> (SELECT TOP 1 b.number
                   FROM xx.dbo.list AS b
                   WHERE b.[group] = a.[group]
                     AND b.[date] >= '2018-06-01'
                     AND b.number > 0
                     AND b.[date] < a.[date]
                   ORDER BY b.[date] DESC)
ORDER BY a.[date] DESC
I have a table that looks like this
Group date Number
--------------------------
10 2018-02-06 4
10 2018-04-06 4
10 2018-06-12 4
10NC 2018-02-06 68
10NC 2018-04-06 35
10NC 2018-06-11 35
10NC 2018-06-12 68
10NC 2018-06-13 35
210 2018-06-02 94
210 2018-06-04 100
210 2018-06-06 100
210 2018-06-07 93
I get this output now, but I only want to get the rows with X
Group date Number
------------------------------
10NC 2018-06-12 68
10NC 2018-06-13 35 X
210 2018-06-04 100
210 2018-06-07 93 X
Can anyone help?
You would use lag():
-- Latest row per group at which number changed compared with the previous row.
-- Step 1 (innermost): attach the previous number within each group using lag().
-- Step 2: keep only rows where the number actually changed and number those
--         changes from newest to oldest.
-- Step 3: keep the newest change of every group.
select c.[group], c.[date], c.number, c.prev_number
from (select a.*,
             row_number() over (partition by a.[group] order by a.[date] desc) as change_seq
      from (select l.[group], l.[date], l.number,
                   lag(l.number) over (partition by l.[group] order by l.[date]) as prev_number
            from xx.dbo.list l
            where l.[group] in ('10', '10NC', '210') and
                  l.[date] >= '2018-06-01' and
                  l.number > 0
           ) a
      -- The first row of a group has a NULL prev_number and is filtered out here.
      where a.prev_number <> a.number
     ) c
where c.change_seq = 1;
Is this what is Expected?
-- Self-contained demo: load the sample rows into a table variable and return,
-- per group, the most recent row whose Number differs from the previous row.
-- Table variables must be named with a leading @ (a leading # denotes a
-- temporary table and cannot be used with DECLARE).
DECLARE @List TABLE ([Group] VARCHAR(100), [Date] DATE, Number INT)

INSERT INTO @List ([Group], [Date], Number)
VALUES
    ('10',   '2018-02-06', 4),
    ('10',   '2018-04-06', 4),
    ('10',   '2018-06-12', 4),
    ('10NC', '2018-02-06', 68),
    ('10NC', '2018-04-06', 35),
    ('10NC', '2018-06-11', 35),
    ('10NC', '2018-06-12', 68),
    ('10NC', '2018-06-13', 35),
    ('210',  '2018-06-02', 94),
    ('210',  '2018-06-04', 100),
    ('210',  '2018-06-06', 100),
    ('210',  '2018-06-07', 93)

;WITH Changes AS
(
    -- Previous Number within the group, restricted to the rows of interest.
    SELECT
         [Group]
        ,[Date]
        ,Number
        ,PrevNumber = LAG(Number) OVER (PARTITION BY [Group] ORDER BY [Date])
    FROM @List
    WHERE
        [Date] >= '2018-06-01'
        AND [Group] IN ('10', '10NC', '210')
        AND Number > 0
),
CTE AS
(
    -- Only rows where the value changed, newest change first; groups whose
    -- Number never changes (group '10' in the sample) produce no row.
    SELECT
         [Group]
        ,[Date]
        ,Number
        ,RN = ROW_NUMBER() OVER (PARTITION BY [Group] ORDER BY [Date] DESC)
    FROM Changes
    WHERE PrevNumber <> Number
)
SELECT * FROM CTE WHERE RN = 1
Note: I am posting it directly in answer as i don't have enough reputation to ask questions in comments.

SELECT TOP N with distinct/unique field values

Based on the answer to a previous question of mine. I end up with a result set something like:
PartyName Risk SubTotal Total
A High 100 280
B Med 25 45
A Low 30 280
A Med 70 280
B Low 10 45
C High 110 170
C Med 60 170
D Low 30 30
A Med 80 280
B Low 10 45
What I need is to SELECT TOP N unique PartyName with highest Amounts, i.e if N = 2 the result should be:
PartyName Risk SubTotal Total
A High 100 280
A Low 30 280
A Med 70 280
C High 110 170
C Med 60 170
A Med 80 280
all entries with the highest N Total values.
Tried this:
-- Sub-total per party and risk level.
WITH RiskSubTotals AS (
    SELECT PartyName,
           Risk,
           SUM(CAST(Amount AS DECIMAL)) AS SubTotal
    FROM CustomerData
    GROUP BY PartyName, Risk
),
-- Grand total per party.
PartyTotals AS (
    SELECT PartyName,
           SUM(CAST(Amount AS DECIMAL)) AS Total
    FROM CustomerData
    GROUP BY PartyName
)
-- Ten distinct (party, risk) rows, each with the party's total attached.
SELECT DISTINCT TOP (10)
       r.PartyName,
       r.Risk,
       r.SubTotal,
       p.Total
FROM RiskSubTotals AS r
LEFT JOIN PartyTotals AS p
    ON r.PartyName = p.PartyName
But doesn't work
Off the top of my head, maybe something like this:
-- Recreate the #test temp table and load the ten sample rows.
IF OBJECT_ID('tempdb.dbo.#test') IS NOT NULL
    DROP TABLE #test

CREATE TABLE #test
(
    partyname VARCHAR(50),
    Risk      VARCHAR(50),
    amount    INT
)

INSERT INTO #test (partyname, Risk, amount)
VALUES
    ('A', 'High', 50),
    ('B', 'Med',  15),
    ('A', 'Low',  12),
    ('A', 'Med',  43),
    ('B', 'Low',  65),
    ('C', 'High', 12),
    ('C', 'Med',  789),
    ('D', 'Low',  12),
    ('A', 'Med',  34),
    ('B', 'Low',  43)
-- Sub-totals per party and risk, restricted to the N parties with the highest
-- overall total (N = 2 here). The leading semicolon terminates whatever
-- statement precedes the CTE.
;WITH RiskSubTotals AS (
    -- party + risk + subtotal
    SELECT PartyName, Risk, SUM(CAST(Amount AS DECIMAL)) AS SubTotal
    FROM #test
    GROUP BY PartyName, Risk
),
PartyTotals AS (
    -- total per party
    SELECT PartyName, SUM(CAST(Amount AS DECIMAL)) AS Total
    FROM #test
    GROUP BY PartyName
),
RankedTotals AS (
    -- number the parties from the highest total downwards
    SELECT partyName, Total, ROW_NUMBER() OVER (ORDER BY Total DESC) AS rid
    FROM PartyTotals
)
SELECT
    main.PartyName,
    main.Risk,
    main.SubTotal,
    TotalValues.Total
FROM RiskSubTotals AS main
INNER JOIN RankedTotals AS TotalValues
    ON TotalValues.partyName = main.partyName
   AND TotalValues.rid <= 2 --n = 2
ORDER BY
    main.partyname,
    TotalValues.Total
First we get a set of data with the totals, next we find the range of totals of interest, and finally we get the results...
Untested:
-- mAgg:   every CustomerData row with its party/risk sub-total and its party
--         total attached (window sums, so one output row per source row).
-- mRange: the two highest distinct party totals.
-- Final:  all mAgg rows whose total is at least the smaller of those two.
WITH mAgg AS (SELECT partyName
                   , Risk
                   , SUM(CAST(amount AS decimal(10,2))) OVER (PARTITION BY partyName, Risk) AS subTotal
                   , SUM(CAST(amount AS decimal(10,2))) OVER (PARTITION BY partyName) AS Total
              FROM CustomerData),
     mRange AS (SELECT DISTINCT TOP 2 Total FROM mAgg ORDER BY Total DESC)
SELECT * FROM mAgg WHERE Total >= (SELECT MIN(Total)
                                   FROM mRange)
Or maybe we could just use dense_rank() over (order by total desc) and then keep every row with rank <= 2 (or N)...
I think this version should do what you want:
-- Sub-total per party and risk, plus the party's overall total, limited to
-- the first 10 rows by total (rows tied with the 10th are included).
-- Total is the SUM of the party's risk sub-totals; MAX would return only the
-- largest single sub-total instead of the party total.
SELECT TOP (10) WITH TIES PartyName, Risk,
       SUM(CAST(Amount AS DECIMAL)) as SubTotal,
       SUM(SUM(CAST(Amount AS DECIMAL))) OVER (PARTITION BY PartyName) as Total
FROM CustomerData
GROUP BY PartyName, Risk
ORDER BY Total DESC, PartyName;
EDIT:
The above gives all rows tied with the 10th. If you want all rows with the 10 distinct values, then let's use DENSE_RANK():
-- All rows belonging to the 10 highest distinct party totals.
-- The inner query no longer uses TOP (10) WITH TIES: WITH TIES needs an
-- ORDER BY, and truncating before ranking would discard rows that
-- DENSE_RANK() is supposed to keep. The rank filter does the limiting.
-- Total is the SUM of the party's risk sub-totals (not the MAX sub-total).
SELECT cd.*
FROM (SELECT cd.*, DENSE_RANK() OVER (ORDER BY Total DESC) as seqnum
      FROM (SELECT PartyName, Risk,
                   SUM(CAST(Amount AS DECIMAL)) as SubTotal,
                   SUM(SUM(CAST(Amount AS DECIMAL))) OVER (PARTITION BY PartyName) as Total
            FROM CustomerData
            GROUP BY PartyName, Risk
           ) cd
     ) cd
WHERE seqnum <= 10
ORDER BY Total DESC, PartyName;

return enddate where cumulative sum of fee column less than 1100 using sql

end date fee
-----------------
05-Sep-14 700
12-Sep-14 200
19-Sep-14 100
26-Sep-14 300
03-Oct-14 400
In the table shown here, I need to return enddate where cumulative sum of fee column is less than 1100 using SQL.
Example:
19-Sep-14 (700 + 200 + 100 < 1100)
-- Latest enddate whose running total of fee stays below 1100.
-- The table is joined to itself (tableName is a placeholder for the real
-- table): t2 supplies every row dated on or before t1's enddate, so
-- SUM(t2.fee) is the cumulative fee up to and including t1.
SELECT TOP 1
    t1.enddate,
    t1.fee,
    SUM(t2.fee) as cumulative_sum
FROM tableName t1
INNER JOIN tableName t2 on t1.enddate >= t2.enddate
GROUP BY t1.enddate, t1.fee
HAVING SUM(t2.fee) < 1100
ORDER BY t1.enddate DESC
Data sample
-- Sample data exposed as a view: one row per end date with its fee.
create view fees
as
select cast(v.end_date_text as date) as end_date,
       v.fee
from (values ('05-Sep-14', 700),
             ('12-Sep-14', 200),
             ('19-Sep-14', 100),
             ('26-Sep-14', 300),
             ('03-Oct-14', 400)
     ) as v(end_date_text, fee)
Solution
-- Latest end_date whose cumulative fee is still below 1100.
-- Each row (later_row) is paired with every row dated on or before it
-- (earlier_row); summing the paired fees gives the running total.
SELECT TOP 1
    later_row.end_date,
    SUM(earlier_row.fee) AS cumulative
FROM fees AS later_row
INNER JOIN fees AS earlier_row
    ON earlier_row.end_date <= later_row.end_date
GROUP BY later_row.end_date
HAVING SUM(earlier_row.fee) < 1100
ORDER BY later_row.end_date DESC

Increment row depending on value of another column

I have the SQL query below, where dtMonthno could start from any month, and I am adding the Row column manually as follows:
-- Tours per calendar month for agency 245 in the given date window, with a
-- sequential Row number in chronological order.
SELECT
    COUNT(*)                            AS count,
    MONTH(TourTbl.DT_Started)           AS dtMonthno,
    DATENAME(YYYY, TourTbl.DT_Started)  AS dtYear,
    ROW_NUMBER() OVER (
        ORDER BY DATENAME(YYYY, TourTbl.DT_Started) ASC,
                 MONTH(TourTbl.DT_Started) ASC
    )                                   AS Row
FROM TourTbl
INNER JOIN BranchTbl
    ON TourTbl.BranchID = BranchTbl.BranchID
INNER JOIN AgencyTbl
    ON AgencyTbl.AgencyID = BranchTbl.AgencyID
WHERE Cancelled = 0
  AND TourTbl.DT_Started BETWEEN '2010/03/15' AND '2012/03/15'
  AND AgencyTbl.AgencyID = 245
  AND BranchRODID > 0
GROUP BY
    DATENAME(M, TourTbl.DT_Started),
    DATENAME(YYYY, TourTbl.DT_Started),
    MONTH(TourTbl.DT_Started)
ORDER BY dtYear ASC, dtMonthno ASC
now my result is :
count dtMonthno dtYear Row
6 5 2011 1
8 6 2011 2
2 7 2011 3
23 8 2011 4
126 9 2011 5
101 10 2011 6
85 11 2011 7
92 12 2011 8
115 1 2012 9
102 2 2012 10
48 3 2012 11
Is there any way to start the Row column from the first dtMonthno and increment it by one from there? In the example above it would start at 5 and end at 15.
Thanks
Try changing the derivation of Row to:
-- Row = chronological row number + zero-based month number of the earliest
-- DT_Started. The parentheses spell out the evaluation order: % is applied
-- to the windowed minimum before the result is added to row_number().
row_number() over (order by YEAR(TourTbl.DT_Started),
                            MONTH(TourTbl.DT_Started))
    + (min(YEAR(TourTbl.DT_Started) * 12 + MONTH(TourTbl.DT_Started) - 1) over () % 12) as Row
You can add month of first DT_Started date:
-- Tours per calendar month, with Row shifted so that numbering starts at the
-- month number of the earliest month in the result instead of at 1.
SELECT COUNT(*) as count,
MONTH(TourTbl.DT_Started) as dtMonthno,
DATENAME(YYYY, TourTbl.DT_Started) as dtYear,
-- Chronological position of the month (1, 2, 3, ...)
row_number() over (order by DATENAME(YYYY, TourTbl.DT_Started) asc,
MONTH(TourTbl.DT_Started) asc )
-- Offset: build a year + two-character month string per group, take the
-- smallest one across all groups (over ()), read the month from characters
-- 5-6 and subtract 1. The arithmetic relies on implicit conversion of that
-- two-character string to a number.
+ substring(min(DATENAME(YYYY, [TourTbl].DT_Started) + right ('0' + str (MONTH([TourTbl].DT_Started), 2), 2)) over (), 5, 2) - 1 as Row
FROM TourTbl
INNER JOIN BranchTbl ON TourTbl.BranchID = BranchTbl.BranchID
INNER JOIN AgencyTbl on AgencyTbl.AgencyID = BranchTbl.AgencyID
WHERE Cancelled = 0 AND
(TourTbl.DT_Started >= '2010/03/15' and
TourTbl.DT_Started <= '2012/03/15') AND
AgencyTbl.AgencyID in ( 245 ) and
BranchRODID > 0
-- Same grouping as the query in the question: month name, year and month number.
group by datename(M, TourTbl.DT_Started),
DATENAME(YYYY, TourTbl.DT_Started),
MONTH(TourTbl.DT_Started)
order by dtYear asc, dtMonthno asc
I would truncate the dates to months and group by those values, then obtain years, months and row numbers based on the truncated dates:
-- Step 1: reduce every qualifying tour to the first day of its month.
WITH MonthlyTours AS (
    SELECT DATEADD(MONTH, DATEDIFF(MONTH, 0, TourTbl.DT_Started), 0) AS GroupMonth
    FROM TourTbl
    INNER JOIN BranchTbl
        ON TourTbl.BranchID = BranchTbl.BranchID
    INNER JOIN AgencyTbl
        ON AgencyTbl.AgencyID = BranchTbl.AgencyID
    WHERE Cancelled = 0
      AND TourTbl.DT_Started BETWEEN '2010/03/15' AND '2012/03/15'
      AND AgencyTbl.AgencyID = 245
      AND BranchRODID > 0
)
-- Step 2: count per month; Row starts at the month number of the earliest
-- month in the result and increases by one per month row.
SELECT
    COUNT(*)                   AS count,
    MONTH(GroupMonth)          AS dtMonthno,
    DATENAME(YYYY, GroupMonth) AS dtYear, -- year returned as a string, as in the question
    ROW_NUMBER() OVER (ORDER BY GroupMonth)
        + MONTH(MIN(GroupMonth) OVER ())
        - 1                    AS Row
FROM MonthlyTours
GROUP BY GroupMonth