NTILE alternative for non-uniformly distributed data sets

NTILE alternative for non-uniformly distributed data sets - sql

I have a data set that I want to display, but it can be very large (thousands of points), so I want to reduce it first. For example, here is the output for 1000+ points:
Currently I use NTILE to get an approximation, but it doesn't work as expected if the points are not distributed uniformly. This is the output I get (NTILE with parameter 100):
How can I avoid this behaviour? The SQL stored procedure is below:
-- Returns rows of SystemHealthCheckData whose GeneratedOnUtc lies between
-- @DateFrom and @DateTo (both inclusive), reduced to at most
-- @EstimatedPointCount averaged points.
--
-- Rows are ordered by GeneratedOnUtc and split by NTILE into groups holding
-- an equal NUMBER OF ROWS (not an equal time span); each group is collapsed
-- into one row containing the averages of its members.
--
-- Raises an error (severity 16) and returns no result set if any parameter is NULL.
ALTER PROCEDURE [dbo].[usp_GetSystemHealthCheckData]
    @DateFrom datetime,
    @DateTo datetime,
    @EstimatedPointCount int
WITH RECOMPILE
AS
BEGIN
    SET NOCOUNT ON;
    SET ARITHABORT ON;

    -- RAISERROR with severity 16 does not end the procedure on its own,
    -- so each check returns explicitly after reporting the problem.
    IF @DateFrom IS NULL
    BEGIN
        RAISERROR ('@DateFrom cannot be NULL', 16, 1);
        RETURN;
    END

    IF @DateTo IS NULL
    BEGIN
        RAISERROR ('@DateTo cannot be NULL', 16, 1);
        RETURN;
    END

    IF @EstimatedPointCount IS NULL
    BEGIN
        RAISERROR ('@EstimatedPointCount cannot be NULL', 16, 1);
        RETURN;
    END

    -- Tag every row in the requested window with its NTILE bucket.
    ;WITH T AS
    (
        SELECT
            CpuPercentPayload,
            FreeRamMb,
            FreeDriveMb,
            GeneratedOnUtc,
            GroupId = NTILE(@EstimatedPointCount) OVER (ORDER BY GeneratedOnUtc)
        FROM SystemHealthCheckData
        WHERE GeneratedOnUtc BETWEEN @DateFrom AND @DateTo
    )
    SELECT
        CpuPercentPayload = AVG(CpuPercentPayload),
        FreeRamMb = AVG(FreeRamMb),
        FreeDriveMb = AVG(FreeDriveMb),
        -- datetime cannot be averaged directly: average its numeric (day-based)
        -- representation and convert the result back.
        GeneratedOnUtc = CAST(AVG(CAST(GeneratedOnUtc AS DECIMAL(18, 6))) AS DATETIME)
    FROM T
    GROUP BY GroupId;
END

EDIT: new approach
You might split your load with NTILE and then calculate an average for each group. I split my set into 4 groups, so the query comes back with 4 average values. The number of groups could be calculated from the number of points you have, or it could be fixed.
Something like this:
-- Sample data: a long, slowly falling run followed by a few wild jumps.
DECLARE @tbl TABLE(id INT IDENTITY, nmbr FLOAT);
INSERT INTO @tbl (nmbr)
VALUES (5),(4.5),(4),(3.5),(3),(2.5),(2),(1.5),(1),(1.5),(1),(0.5),(0),(13),(2),(17),(5),(22),(24),(2),(3),(11);

-- Every original row is returned together with the average of the NTILE
-- group (4 groups, ordered by id) it belongs to.
SELECT
    tbl2.*,
    AVG(tbl2.nmbr) OVER (PARTITION BY tbl2.tile) AS tile_avg
FROM
(
    SELECT
        tbl.*,
        NTILE(4) OVER (ORDER BY tbl.id) AS tile
    FROM @tbl AS tbl
) AS tbl2;
If you want it reduced to the group values only you could try this
-- One row per NTILE group (4 groups, ordered by id) with that group's average.
-- Expects the sample table variable (columns id, nmbr) to be declared
-- earlier in the same batch.
SELECT
    AVG(tbl2.nmbr) AS tile_avg,
    tbl2.tile
FROM
(
    SELECT
        tbl.*,
        NTILE(4) OVER (ORDER BY tbl.id) AS tile
    FROM @tbl AS tbl
) AS tbl2
GROUP BY tbl2.tile;
--old text
You may want to think about a sliding average... In this example I tried to rebuild your values (a long linear fall followed by wild jumps at the end). You can set the @pre and @post variables to control the degree of "flattening".
In short: an average is calculated for each element and its direct neighbours.
Be aware that you must add an ORDER BY to avoid results coming back in random order...
-- Sample data: a long, slowly falling run followed by a few wild jumps.
DECLARE @tbl TABLE(id INT IDENTITY, nmbr FLOAT);
INSERT INTO @tbl (nmbr)
VALUES (5),(4.5),(4),(3.5),(3),(2.5),(2),(1.5),(1),(1.5),(1),(0.5),(0),(13),(2),(17),(5),(22),(24),(2),(3),(11);

-- Window size: how many ids before / after the current row take part in its average.
DECLARE @pre INT = 3;
DECLARE @post INT = 3;

SELECT
    tbl.*,
    AvgBorders.*,
    AvgSums.*,
    AvgSlide.*
FROM @tbl AS tbl
-- id range [id - @pre, id + @post] that forms this row's window
CROSS APPLY
(
    SELECT
        tbl.id - @pre AS AvgStart,
        tbl.id + @post AS AvgEnd
) AS AvgBorders
-- count and sum of all rows whose id falls inside the window
CROSS APPLY
(
    SELECT
        COUNT(neighbour.nmbr) AS CountNmbr,
        SUM(neighbour.nmbr) AS SumNmbr
    FROM @tbl AS neighbour
    WHERE neighbour.id BETWEEN AvgBorders.AvgStart AND AvgBorders.AvgEnd
) AS AvgSums
-- the sliding average itself; NULLIF guards the division if every nmbr in the window is NULL
CROSS APPLY
(
    SELECT AvgSums.SumNmbr / NULLIF(AvgSums.CountNmbr, 0) AS AvgValue
) AS AvgSlide
ORDER BY tbl.id;

Related

Inserting individual values into table based on a number

Here is my problem: I have a stored procedure in SQL Server 2012 which should do the following.
I will pass an input parameter @Range, and the stored procedure should insert the values from 0 to @Range-1 into a table.
-- Skeleton of the procedure asked about: it should insert the values
-- 0 .. @Range-1 into MyTable, one value per row.
CREATE PROC MyExample
    (@Range INT)
AS
BEGIN
    -- Suppose the value of @Range is 100
    -- So I should do INSERT into MyTable Values(0,1,2,3,4,5,6,......99)
    SET NOCOUNT ON; -- placeholder statement: a BEGIN...END body cannot be empty
END
Any idea how to achieve this?

You can use while loop as below:
-- Body for the procedure: inserts 0 .. @Range-1 one row at a time.
-- @Range is the procedure's input parameter; nothing is inserted when it is <= 0.
DECLARE @Index AS INT = 0;
WHILE @Index < @Range
BEGIN
    INSERT INTO MyTable VALUES (@Index);
    SET @Index = @Index + 1;
END

Your teacher may wonder why you are using a CTE when you have only just learned loops, but here it is:
-- Inserts the values 0 .. @Range-1 into MyTable using a recursive CTE.
-- Inserts nothing when @Range is 0 or negative.
CREATE PROC MyExample
(
    @Range INT
)
AS
BEGIN
    ;WITH numbers AS
    (
        -- Anchor: start at 0, but only if there is at least one value to produce.
        SELECT 0 AS Value WHERE @Range > 0
        UNION ALL
        -- Step: keep adding 1 while the next value is still below @Range.
        SELECT Value + 1 FROM numbers WHERE Value + 1 < @Range
    )
    INSERT INTO MyTable
    SELECT Value FROM numbers
    OPTION (MAXRECURSION 0); -- lift the default recursion limit of 100 levels
END

And here is a set based approach:
-- Inserts the values 0 .. @Range-1 into MyTable (column Number) in one
-- set-based statement.
-- The cross join of sys.objects with itself is used only as a row source, so
-- the number of rows produced is capped by (row count of sys.objects) squared.
CREATE PROC MyExample
(
    @Range INT
)
AS
BEGIN
    INSERT INTO MyTable (Number)
    SELECT TOP (@Range)
        ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) - 1 -- ROW_NUMBER starts at 1
    FROM sys.objects AS s1
    CROSS JOIN sys.objects AS s2;
END
(Based on this SO post)

-- Inserts the values 0 .. @Range-1 into MyTable with a countdown loop.
-- Inserts nothing when @Range is 0 or negative.
CREATE PROC MyExample
(
    @Range INT
)
AS
BEGIN
    -- Number of values still to insert.
    DECLARE @RANGE_COUNT int;
    SELECT @RANGE_COUNT = @Range;

    -- Suppose the value of @Range is 100
    -- "> 0" rather than "<> 0" so a negative @Range cannot loop forever.
    WHILE @RANGE_COUNT > 0
    BEGIN
        -- So I should do INSERT into MyTable Values(0,1,2,3,4,5,6,......99)
        -- @Range - @RANGE_COUNT runs 0, 1, 2, ... as the counter falls.
        INSERT INTO MyTable VALUES (@Range - @RANGE_COUNT);
        SET @RANGE_COUNT = @RANGE_COUNT - 1;
    END
END

Using tally table technique:
-- Generates 0 .. @range-1 without touching any table: three 10-row VALUES
-- lists are cross joined (10 x 10 x 10 = 1000 candidate rows) and numbered.
DECLARE @range INT = 100;
SELECT TOP (@range)
    -1 + ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS rn
FROM
    (VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) t1(n) CROSS JOIN --10
    (VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) t2(n) CROSS JOIN --100
    (VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) t3(n) --1000
--...continue to cover all possible @range values

dynamic alias in sql server

I want to query a field with a different alias each time in a stored procedure:
select COUNT(EmpCode) as CountEmp+@para
The result should be:
CountEmp1
45
CountEmp2
54
CountEmp1
76
The query is run in a loop from C# code:
select COUNT(EmpCode) where something = @something as CountEmp+@para

Approach without dynamic SQL:
-- Demo data in a table variable.
DECLARE @some_table TABLE (
    Something int,
    EmpCode INT
);
INSERT INTO @some_table (Something, EmpCode)
VALUES (1, 10),(1, 22),(1, 12),(2, 12),(2, 30),(3, 65),(3, 15),(3, 11),(3, 5);

-- The parameter we want to search for.
DECLARE @param int = 1;

-- The CTE builds, for the requested Something value, the label
-- ('CountEmp' + parameter) and the count of EmpCode, both as text so they
-- can share one output column.
;WITH cte AS (
    SELECT
        'CountEmp' + CAST(@param AS nvarchar(10)) AS Label,
        CAST(COUNT(EmpCode) AS nvarchar(10)) AS EmpCodeCount,
        ROW_NUMBER() OVER (ORDER BY Something) AS rn
    FROM @some_table
    WHERE Something = @param
    GROUP BY Something
)
-- Stack label and count into a single column: label row first, then its count.
SELECT t.Result
FROM (
    SELECT Label AS Result, rn, 1 AS part
    FROM cte
    UNION ALL
    SELECT EmpCodeCount, rn, 2 AS part
    FROM cte
) AS t
-- An explicit sort key is used instead of comparing the label text with the
-- count text, which would depend on collation.
ORDER BY t.rn, t.part;
Output:
Result
------------------
CountEmp1
3
(2 row(s) affected)

Please try the code below. It works fine with SQL Server 2012.
-- Re-create the demo temp table: drop any leftover copy from a previous run
-- in this session, then create it fresh. Temp tables live in tempdb.
IF OBJECT_ID('tempdb..#Mytable') IS NOT NULL
    DROP TABLE #Mytable;
CREATE TABLE #Mytable (ID INT IDENTITY (1,1), EmpCode INT);

DECLARE @max int, @count int;
SET @max = 0;
DECLARE @str varchar(10);

INSERT #Mytable
    (EmpCode)
VALUES
    (10),
    (45),
    (35),
    (63),
    (56),
    (65);

SET @count = (SELECT COUNT(ID) FROM #Mytable);

-- One result set per row; the column alias gets the row number as a suffix
-- (Empcode1, Empcode2, ...). A column alias cannot be a variable, hence the
-- dynamic SQL. @str is built only from the integer counter.
WHILE @count > @max
BEGIN
    SET @max = @max + 1;
    SET @str = CONVERT(varchar(10), @max);
    EXEC('SELECT EmpCode AS Empcode' + @str + ' FROM #Mytable WHERE ID = ' + @str);
END

Splitting multiple delimited values into multiple rows [duplicate]

This question already has answers here:
SQL Server: Split operation
(5 answers)
Closed 6 years ago.
I have been looking for a solution on StackOverflow but didn't find anything useful. I am facing an issue and I hope someone can help me out.
I have value like this:
-- Demo table: one row per customer; the last three columns each hold several
-- values packed into one string, separated by the character 'ý'.
CREATE TABLE DemoRecords
(
    CustID         int identity (1,1),
    CustomerName   varchar(50),
    CurrencyCode   varchar(50),
    CurrentBalance varchar(50),
    DateValue      varchar(50)
)
GO
-- Sample row: three currencies with their matching balances and dates.
INSERT INTO DemoRecords (CustomerName, CurrencyCode, CurrentBalance, DateValue)
VALUES ('Mr. X', 'BDTýUSDýGBP', '10500ý2500ý1050', '20150101ý20150201ý20150301')
..and I need output like this: (Please take a look at the picture attached below)
Picture
Please don't suggest me to use CTE because there are more than 100 columns in that table.

Here is a function to split a string into rows. Below that is a query against your demorecords table that uses the function to get the requested result.
-- Splits @delimited on every occurrence of @delimiter and returns one row per
-- piece, numbered from 1 in the order the pieces appear.
--   @delimited : the string to split; NULL yields an empty result.
--   @delimiter : the separator (up to 5 characters). NULL or empty means
--                "no separator": the whole string comes back as one row.
-- Empty pieces (adjacent, leading or trailing delimiters) are returned as ''.
create function dbo.split
(
    @delimited nvarchar(max),
    @delimiter nvarchar(5)
)
returns @rows table
(
    rownumber int not null identity(1,1),
    value nvarchar(max) not null
)
as
begin
    if @delimited is null return;

    -- charindex() returns 0 for an empty search string, which takes the
    -- "no more delimiters" path below.
    if @delimiter is null set @delimiter = N'';

    -- datalength()/2 instead of len(): len() ignores trailing spaces, which
    -- would make a delimiter such as N' ' look zero-length (endless loop) and
    -- would trim trailing spaces off the last piece.
    declare @delr_len int = datalength(@delimiter) / 2;
    declare @total_len int = datalength(@delimited) / 2;
    declare @start_at int = 1;
    declare @end_at int;

    while 1 = 1
    begin
        set @end_at = charindex(@delimiter, @delimited, @start_at);

        if @end_at = 0
        begin
            -- no further delimiter: the rest of the string is the last piece
            insert into @rows (value)
            values (substring(@delimited, @start_at, @total_len - @start_at + 1));
            break;
        end

        insert into @rows (value)
        values (substring(@delimited, @start_at, @end_at - @start_at));

        -- continue right behind the delimiter just found
        set @start_at = @end_at + @delr_len;
    end

    return;
end
go
-- One output row per packed position: the currency list drives the row count,
-- and the balance and date pieces are matched to it by their position
-- (rownumber) in their own lists.
select
    r.custid,
    r.customername,
    ccy.value as currencycode,
    bal.value as currentbalance,
    dt.value  as datevalue
from demorecords as r
cross apply (
    select s.rownumber, s.value
    from dbo.split(r.currencycode, 'ý') as s
) as ccy
cross apply (
    select s.rownumber, s.value
    from dbo.split(r.currentbalance, 'ý') as s
    where s.rownumber = ccy.rownumber
) as bal
cross apply (
    select s.rownumber, s.value
    from dbo.split(r.datevalue, 'ý') as s
    where s.rownumber = ccy.rownumber
) as dt
If you have a column that may contain missing values, use an outer apply instead of an inner apply to join the result of the function for that column. In the following example, the DateValue column is missing value 3 and value 4.
-- Sample row whose DateValue list is shorter than its currency list.
INSERT INTO DemoRecords (CustomerName, CurrencyCode, CurrentBalance, DateValue)
VALUES ('Mr. X', 'BDTýUSDýGBPýEUR', '10500ý2500ý1050ý', 'ý')

-- Same position-matched split as before, but the date list is joined with
-- OUTER APPLY so currencies without a matching date piece are kept
-- (datevalue is NULL for them).
select
    r.custid,
    r.customername,
    ccy.value as currencycode,
    bal.value as currentbalance,
    dt.value  as datevalue
from demorecords as r
cross apply (
    select s.rownumber, s.value
    from dbo.split(r.currencycode, 'ý') as s
) as ccy
cross apply (
    select s.rownumber, s.value
    from dbo.split(r.currentbalance, 'ý') as s
    where s.rownumber = ccy.rownumber
) as bal
outer apply (
    select s.rownumber, s.value
    from dbo.split(r.datevalue, 'ý') as s
    where s.rownumber = ccy.rownumber
) as dt
Alternatively, you could clean up your input to not be missing values. In the above example, I would expect DateValue to be 'ýýý' not 'ý'. If your situation allows it, you might prefer finding and fixing these and not using an outer join.

Select random rows and stop when a specific sum/total is reached

I'm using SQL Server 2012 and I'm trying to do something like this:
-- Total miles for March 2012.
-- Unseparated yyyymmdd literals are not affected by language/DATEFORMAT
-- settings, and the half-open range also covers any time-of-day on March 31.
SELECT SUM(MILES)
FROM tblName
WHERE mDate >= '20120301'
  AND mDate <  '20120401'
-- and...
/*
now I want to add here do until the SUM of Miles
is equal to or greater then '3250' and get the
results rows randomly
*/
So in other words, I want to select random rows from a table that have a specified from and to date and stop when the sum of miles is at or over the number: 3250

Since you're using SQL Server 2012, here is a much easier approach that doesn't require looping.
-- Sample data.
DECLARE @tbl TABLE(mDate DATE, Miles INT);
INSERT @tbl (mDate, Miles) VALUES
    ('20120201', 520), ('20120312', 620),
    ('20120313', 720), ('20120314', 560),
    ('20120315', 380), ('20120316', 990),
    ('20120317', 1020), ('20120412', 520);

-- Running total of Miles over the March rows taken in random order.
;WITH x AS
(
    SELECT
        mDate,
        Miles,
        s = SUM(Miles) OVER
            (
                ORDER BY NEWID() ROWS UNBOUNDED PRECEDING
            )
    FROM @tbl
    WHERE mDate >= '20120301'
      AND mDate <  '20120401'
)
SELECT
    mDate,
    Miles,
    s
FROM x
-- Keep every row whose running total BEFORE adding it was still below the
-- target, so the row that reaches or crosses 3250 is included and the
-- selection stops right after it ("at or over 3250").
WHERE s - Miles < 3250
ORDER BY s;
SQLfiddle demo - hit "Run SQL" multiple times to see random results.

You can do SELECT TOP x ... ORDER BY newid() to get a sum of random rows. The problem lies in determining 'x'. You can't even be sure that the largest value of 'x' (number of rows that match the query) will have a total large enough to meet your requirements without testing that first:
-- Finds the smallest x for which the miles of x randomly chosen March rows
-- add up to at least @stopAt, and returns that sum (NULL if even all rows
-- together are not enough). Each pass draws a fresh random sample.
DECLARE @stopAt int;
DECLARE @x int;
DECLARE @result int;
SET @stopAt = 3250;
SET @x = 1;

-- Feasibility check: total of all matching rows.
SELECT @result = SUM(MILES)
FROM tblName
WHERE mDate >= '20120301'
  AND mDate <  '20120401';

-- NULL total means no matching rows at all
IF (@result IS NULL OR @result < @stopAt)
    SELECT NULL; -- this can't be done
ELSE
BEGIN
    WHILE (1 = 1)
    BEGIN
        -- TOP must be applied BEFORE aggregating: pick @x random rows in a
        -- derived table, then sum them. (TOP on the aggregate query itself
        -- would limit the single SUM row, i.e. always sum every row.)
        SELECT @result = SUM(r.MILES)
        FROM
        (
            SELECT TOP (@x) MILES
            FROM tblName
            WHERE mDate >= '20120301'
              AND mDate <  '20120401'
            ORDER BY NEWID()
        ) AS r;

        IF @result >= @stopAt
            BREAK;

        SET @x = @x + 1;
    END

    SELECT @result;
END
Just a note about this - the algorithm starts at 1 and increments until a suitable match is found. A more efficient approach (for larger sets of data) might include a binary type search that caches the lowest suitable result and returns when the deepest node (or an exact match) is found.

I can't think of a way without a TSQL While... loop. This in combination with the TSQL paging with ROW_NUMBER() should get you there.
http://www.mssqltips.com/sqlservertip/1175/page-through-sql-server-results-with-the-rownumber-function/
In the ROW_NUMBER query, sum the Miles into another MileSum column, then in the while loop select the set all the rows that correspond with the ROW_NUMBER query while accumulating these MileSum values into a variable. Terminate when the variable exceeds 3250.

Try
-- Lists the MILES of rows in the date range, ranked in random order.
-- NOTE(review): the miles > 3250 filter applies to each single row, not to a
-- running total - confirm this is what is wanted.
SELECT
    MILES,
    RANK() OVER (ORDER BY NEWID()) AS yourRank
FROM #tblName
WHERE mDate >= '03/01/2012'
  AND mDate <= '03/31/2012'
  AND miles > 3250
ORDER BY yourRank
and then you can add a TOP 5 or whatever you want.
You get those in random order for sure.

Just a sample code for you to understand the concept.
-- Demo table holding the integers 1 .. 21.
create table temp(intt int);
insert into temp (intt)
values (1),(2),(3),(4),(5),(6),(7),
       (8),(9),(10),(11),(12),(13),(14),
       (15),(16),(17),(18),(19),(20),(21);

declare @sum int = 0;      -- running total of the values drawn so far
declare @prevInt int = 0;  -- the value drawn in the latest pass

-- Draw one random row per pass (the same row may be drawn again) and add it
-- to the total until the total reaches 50 or more.
while (@sum < 50)
begin
    set @prevInt = (select top(1) intt from temp order by newid());
    set @sum = @sum + @prevInt;
end

-- Undo the last draw so the reported total is the last one below 50.
set @sum = @sum - @prevInt;
select @sum;

drop table temp;
The reason for this approach is that paging would not return a widely spread result unless you have thousands of records: the data is grouped into pages, and with few records the same page is hit multiple times, giving the same result.
Also, there might be cases when a blank page is hit, giving 0 as the result. (I don't know why a blank page is sometimes hit.)

t-sql find specific value with a csv string

I need some help with a SQL query. I have a column with values stored as comma-separated values.
I need to write a query which finds the 3rd delimited item within each value in the column.
Is it possible to do this in a SELECT statement?
ex: ColumnValue: josh,Reg01,False,a0-t0,22/09/2010
So I will need to get the 3rd value (i.e.) False from the above string.

Yes.
Where @s is your string...
-- Third comma-separated item of @s: everything between the 2nd and the 3rd comma.
-- The end position is searched in @s + ',' so that a string with exactly three
-- items (no 3rd comma) still yields its last item instead of a negative length.
-- Still requires @s to contain at least two commas.
select
    SUBSTRING(@s,
        CHARINDEX(',', @s, CHARINDEX(',', @s) + 1) + 1,              -- first char after the 2nd comma
        CHARINDEX(',', @s + ',', CHARINDEX(',', @s, CHARINDEX(',', @s) + 1) + 1)  -- position of the 3rd comma
        - CHARINDEX(',', @s, CHARINDEX(',', @s) + 1) - 1)             -- minus position of the 2nd comma, minus 1
Or more generically...
-- Walks @s one item at a time. Each row holds:
--   Item  : 1-based item number
--   Start : position of the item's first character
--   Split : position of the comma that ends the item, NULL for the last item
;with cte as
(
    -- nullif() here as well, so a string without any comma gets Split = NULL
    -- (handled by isnull below) instead of 0.
    select 1 as Item, 1 as Start, nullif(CHARINDEX(',', @s, 1), 0) as Split
    union all
    select cte.Item + 1, cte.Split + 1, nullif(CHARINDEX(',', @s, cte.Split + 1), 0) as Split
    from cte
    where cte.Split is not null -- stop after the last item
)
-- For the last item there is no closing comma: pretend one sits just past the end.
select SUBSTRING(@s, Start, isnull(Split, len(@s) + 1) - Start)
from cte
where Item = 3
Now store your data properly :)

Try this (assuming SQL Server 2005+)
-- Sample data.
DECLARE @t TABLE(ColumnValue VARCHAR(50));
INSERT INTO @t(ColumnValue) SELECT 'josh,Reg01,False,a0-t0,22/09/2010';
INSERT INTO @t(ColumnValue) SELECT 'mango,apple,bannana,grapes';
INSERT INTO @t(ColumnValue) SELECT 'stackoverflow';

-- Each CSV string is turned into XML (<X>item</X><X>item</X>...), shredded
-- into one row per item, numbered per source string, and the 3rd is kept.
-- Strings with fewer than three items produce no row.
-- NOTE(review): the numbering uses ORDER BY (SELECT 1), i.e. it relies on the
-- shredded nodes arriving in document order - confirm this is acceptable.
-- Characters that are special in XML (&, <) in the data would break the CAST.
SELECT ThirdValue = splitdata
FROM (
    SELECT
        Rn = ROW_NUMBER() OVER (PARTITION BY X.ColumnValue ORDER BY (SELECT 1)),
        X.ColumnValue,
        Y.splitdata
    FROM
    (
        SELECT
            F.*,
            CAST('<X>' + REPLACE(F.ColumnValue, ',', '</X><X>') + '</X>' AS XML) AS xmlfilter
        FROM @t F
    ) X
    CROSS APPLY
    (
        SELECT fdata.D.value('.', 'varchar(50)') AS splitdata
        FROM X.xmlfilter.nodes('X') AS fdata(D)
    ) Y
) X
WHERE X.Rn = 3
//Result
ThirdValue
False
bannana
Also, it is not clear from your question which version of SQL Server you are using. If you are using SQL Server 2000, you can use the approach below.
Step 1: Create a number table
-- Helper table holding consecutive integers starting at 1.
CREATE TABLE dbo.Numbers
(
    N INT NOT NULL PRIMARY KEY
);
GO
-- Fill by doubling: each pass copies all existing rows shifted by the current
-- row count, so the table grows 1, 2, 4, 8, ... rows.
-- @rows tracks the current row count; the loop ends once it exceeds 10000.
DECLARE @rows AS INT;
SET @rows = 1;
INSERT INTO dbo.Numbers (N) VALUES (1);
WHILE (@rows <= 10000)
BEGIN
    INSERT INTO dbo.Numbers (N) SELECT N + @rows FROM dbo.Numbers;
    SET @rows = @rows * 2;
END
Step 2: Apply the query below
-- Sample data.
DECLARE @t TABLE(ColumnValue VARCHAR(50));
INSERT INTO @t(ColumnValue) SELECT 'josh,Reg01,False,a0-t0,22/09/2010';
INSERT INTO @t(ColumnValue) SELECT 'mango,apple,bannana,grapes';
INSERT INTO @t(ColumnValue) SELECT 'stackoverflow';

-- Table variable with an identity column to hold the intermediate results.
DECLARE @tempT TABLE(Id INT IDENTITY, ColumnValue VARCHAR(50), SplitData VARCHAR(50));

-- One row per item: a number N marks the start of an item when the character
-- before it is a comma (a comma is prepended so position 1 qualifies too);
-- the item runs from N up to the next comma (a comma is appended so the last
-- item is closed as well).
INSERT INTO @tempT (ColumnValue, SplitData)
SELECT
    ColumnValue,
    SUBSTRING(ColumnValue, Numbers.N, CHARINDEX(',', ColumnValue + ',', Numbers.N) - Numbers.N) AS splitdata
FROM @t
JOIN Numbers
    ON Numbers.N <= DATALENGTH(ColumnValue) + 1
   AND SUBSTRING(',' + ColumnValue, Numbers.N, 1) = ','
-- The ranking below counts rows by Id, so the identity values must follow the
-- item positions: ORDER BY on INSERT...SELECT fixes the order in which they
-- are assigned.
ORDER BY ColumnValue, Numbers.N;

-- Project the filtered records.
SELECT ThirdValue = X.splitdata
FROM
    -- The correlated subquery does the job of
    -- ROW_NUMBER() OVER(PARTITION BY ColumnValue ORDER BY Id).
    (SELECT
        Rn = (SELECT COUNT(*)
              FROM @tempT t2
              WHERE t2.ColumnValue = t1.ColumnValue
                AND t2.Id <= t1.Id),
        t1.ColumnValue,
        t1.splitdata
     FROM @tempT t1) X
WHERE X.Rn = 3
-- Result
ThirdValue
False
bannana
Also you can use Master..spt_Values for your number table
-- Sample data.
DECLARE @t TABLE(ColumnValue VARCHAR(50));
INSERT INTO @t(ColumnValue) SELECT 'josh,Reg01,False,a0-t0,22/09/2010';
INSERT INTO @t(ColumnValue) SELECT 'mango,apple,bannana,grapes';
INSERT INTO @t(ColumnValue) SELECT 'stackoverflow';

-- Table variable with an identity column to hold the intermediate results.
DECLARE @tempT TABLE(Id INT IDENTITY, ColumnValue VARCHAR(50), SplitData VARCHAR(50));

-- One row per item, using the type 'P' rows of master..spt_values as the
-- number source: a Number marks the start of an item when the character
-- before it is a comma (a comma is prepended so position 1 qualifies too);
-- the item runs from there up to the next comma (a comma is appended so the
-- last item is closed as well).
INSERT INTO @tempT (ColumnValue, SplitData)
SELECT
    ColumnValue,
    SUBSTRING(ColumnValue, Number, CHARINDEX(',', ColumnValue + ',', Number) - Number) AS splitdata
FROM @t
JOIN master..spt_values
    ON Number <= DATALENGTH(ColumnValue) + 1
   AND type = 'P'
   AND SUBSTRING(',' + ColumnValue, Number, 1) = ','
-- The ranking below counts rows by Id, so the identity values must follow the
-- item positions: ORDER BY on INSERT...SELECT fixes the order in which they
-- are assigned.
ORDER BY ColumnValue, Number;

-- Project the filtered records.
SELECT ThirdValue = X.splitdata
FROM
    -- The correlated subquery does the job of
    -- ROW_NUMBER() OVER(PARTITION BY ColumnValue ORDER BY Id).
    (SELECT
        Rn = (SELECT COUNT(*)
              FROM @tempT t2
              WHERE t2.ColumnValue = t1.ColumnValue
                AND t2.Id <= t1.Id),
        t1.ColumnValue,
        t1.splitdata
     FROM @tempT t1) X
WHERE X.Rn = 3
You can read about this from
1) What is the purpose of system table table master..spt_values and what are the meanings of its values?
2) Why (and how) to split column using master..spt_values?

You really need something like String.Split(',')(2), which unfortunately does not exist in SQL, but this may be helpful to you

You can test this solution against the others, but I believe that using XML in such situations almost always gives you the best performance and requires less code:
-- Returns the item at 1-based position @ValueIndexToGet of a comma-separated
-- string (no row if the position is out of range).
DECLARE @InPutCSV NVARCHAR(2000) = 'josh,Reg01,False,a0-t0,22/09/2010';
DECLARE @ValueIndexToGet INT = 3;

-- Turn the CSV into <r><d>item</d><d>item</d>...</r>.
-- Characters that are special in XML (&, <) in the input would break the CAST.
DECLARE @XML XML = CAST('<r><d>' + REPLACE(@InPutCSV, ',', '</d><d>') + '</d></r>' AS XML);

-- Select the <d> element by its POSITION in the document. Numbering the items
-- with ROW_NUMBER() ordered by their text would rank them alphabetically and
-- hit the right item only by coincidence.
WITH CTE (RecordNumber, Value) AS
(
    SELECT
        @ValueIndexToGet AS RecordNumber,
        T.v.value('.', 'NVARCHAR(100)') AS Value
    FROM @XML.nodes('/r/d[position() = sql:variable("@ValueIndexToGet")]') AS T(v)
)
SELECT Value
FROM CTE
WHERE RecordNumber = @ValueIndexToGet
I can confirm that it takes about 1 second to get a value from a CSV string with 100,000 values.

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

NTILE alternative for non-uniformely distributed data sets - sql

Related

Inserting individual values into table based on a number

dynamic alias in sql server

Splitting multiple delimited values into multiple rows [duplicate]

Select random rows and stop when a specific sum/total is reached

t-sql find specific value with a csv string

Categories

Resources