amazon redshift sql: remove top 1% of data as outliers - sql

I'm trying to remove 1% of the data, because we consider those rows outliers that greatly skew the results. I've tried using SELECT TOP 99 PERCENT, but Amazon Redshift does not support percentages with TOP.
I've tried something like:
WITH
elapsed_times AS (
SELECT
COALESCE(anonymous_id, distinct_id) as id,
elapsed_time
FROM studio_production.interaction
WHERE project_id = '55062b464a9bc578006987ff'
),
max_elapsed_time AS (
SELECT elapsed_time
FROM elapsed_times
ORDER BY elapsed_time ASC
OFFSET ROUND(0.99 * (SELECT COUNT(*) FROM elapsed_times))
LIMIT 1
),
user_times AS (
SELECT
id,
LEAST(elapsed_time, max_elapsed_time) as elapsed_time
FROM elapsed_times
GROUP BY 1
)
SELECT AVG(elapsed_time) FROM user_times
But I get: argument of OFFSET must not contain subqueries
thus, my query is now:
WITH
elapsed_times AS (
SELECT
COALESCE(anonymous_id, distinct_id) as id,
elapsed_time,
RANK() OVER (ORDER BY elapsed_time ASC) as rnk
FROM studio_production.interaction
WHERE project_id = '55062b464a9bc578006987ff'
),
user_times AS (
SELECT
id,
LEAST(MAX(elapsed_time), (
SELECT MIN(elapsed_time)
FROM elapsed_times
WHERE rnk > ROUND(0.99 * (SELECT COUNT(*) FROM elapsed_times))
)) as elapsed_time
FROM elapsed_times
GROUP BY 1
)
SELECT AVG(elapsed_time) FROM user_times
which is actually quite slow. what is the correct approach to this problem?

You could use ntile() (see here):
-- Average elapsed_time after dropping the first and last of the 100 ntile buckets.
-- ntile(100) numbers the buckets 1..100 in ascending elapsed_time order.
select avg(elapsed_time)
from (
    select et.*,
           ntile(100) over (order by elapsed_time) as thetile
    from elapsed_times et
) bucketed
where thetile between 2 and 99;
EDIT:
I admit that I often do this using row_number() and count():
-- Trimmed mean: average elapsed_time after discarding the lowest and highest 1% of rows.
-- seqnum is the row's position in ascending elapsed_time order; cnt is the total row count.
select avg(elapsed_time)
from (select et.*,
             row_number() over (order by elapsed_time) as seqnum,
             count(*) over () as cnt
      from elapsed_times et
     ) et
-- Keep only the middle rows: positions at or below the 1% mark and at or above
-- the 99% mark are the outliers and must be excluded, not selected.
where seqnum > 0.01 * cnt
  and seqnum < 0.99 * cnt;

Related

Finding Median Using MySQL

I know that there are many ways to find the median, but I am trying to use this method to find the median. Can someone explain to me why this does not work? The error here says "Invalid use of group function," but when I use HAVING instead of WHERE, the system doesn't recognize what RowNumber is. I'm very confused.
SELECT
ROUND(AVG(LS.LAT_N))
FROM(
SELECT
LAT_N,
ROW_NUMBER() OVER (ORDER BY LAT_N) AS RowNumber
FROM
STATION
) AS LS
WHERE
RowNumber IN (
IF(
FLOOR(COUNT(LS.LAT_N)/2+0.5) = CEIL(COUNT(LS.LAT_N)/2+0.5),
FLOOR(COUNT(LS.LAT_N)/2+0.5),
FLOOR(COUNT(LS.LAT_N)/2+0.5) AND CEIL(COUNT(LS.LAT_N)/2+0.5)
)
I typically write this as:
-- Median of LAT_N: the middle row for an odd row count, or the average of the
-- two middle rows for an even row count.
SELECT AVG(LAT_N)
FROM (
    SELECT LAT_N,
           ROW_NUMBER() OVER (ORDER BY LAT_N) AS RowNumber,
           COUNT(*) OVER () AS cnt
    FROM STATION
) ranked
-- 2 * RowNumber lands in [cnt, cnt + 2] only for the middle position(s).
WHERE 2 * RowNumber BETWEEN cnt AND cnt + 2;
Here is a db<>fiddle.
The median is the middle element in an ordered series - or the average of the two middle elements if there is an even number.
-- Median of LAT_N: average the row(s) whose position lies between the floor and
-- the ceiling of (row count / 2 + 0.5).
-- NOTE(review): relies on "/" being non-integer division, as in MySQL — confirm for other engines.
SELECT AVG(LAT_N)
FROM (
    SELECT LAT_N,
           ROW_NUMBER() OVER (ORDER BY LAT_N) AS RowNumber
    FROM STATION
) AS q
WHERE RowNumber BETWEEN FLOOR((SELECT COUNT(*) FROM STATION) / 2 + 0.5)
                    AND CEIL((SELECT COUNT(*) FROM STATION) / 2 + 0.5)
Here is dbfiddle https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=b31e08f4ece61ecb95d9dde76c389fb1

SQL select row with max value or distinct value and sum all

I have the following data that is returned to me. I need to get a distinct or max sum of all the commission by taxid for a single repnbr. The 'qtrlycommrep' column is the value I'm trying to get to, but not able to. For repnbr c590, I need to get the 854.66 commission amount, which is the max for each taxid.
What am I doing wrong?
Any help would be much appreciated!
Here's what I've tried so far. Using the Row_number
select distinct
sub.Repnbr
, (sub.QtrLYComm) as qtrlycommrep
from (
select distinct repnbr, QtrLYComm
, rn = row_number() over(partition by repnbr order by QtrLYComm desc)
from #qtrly
) sub
where sub.rn = 1
Cross Apply
select distinct
#qtrly.repnbr
, x.QtrLYComm as qtrlycommrep
from #qtrly
cross apply (
select top 1
*
from #qtrly as i
where i.repnbr = Repnbr
order by i.qtrlycomm desc
) as x;
inner join
select
#qtrly.repnbr, #qtrly.qtrlycomm as qtrlycommrep
from #qtrly
inner join (
select maxvalue = max(qtrlycomm), repnbr
from #qtrly
group by repnbr
) as m
on #qtrly.repnbr = m.repnbr
and #qtrly.qtrlycomm = m.maxvalue;
order by row_number
select top 1 with ties
#qtrly.repnbr, #qtrly.qtrlycomm as qtrlycommrep
from #qtrly
order by
row_number() over(partition by repnbr
order by qtrlycomm desc)
You want one value per tax id. You need to include that. For instance:
-- Per rep, add up the highest QtrLYComm of each tax id.
-- seqnum = 1 marks the top-commission row within each (repnbr, taxid) pair.
select ranked.Repnbr, sum(ranked.QtrLYComm) as qtrlycommrep
from (
    select src.*,
           row_number() over (partition by src.repnbr, src.taxid
                              order by src.QtrLYComm desc) as seqnum
    from #qtrly src
) ranked
where ranked.seqnum = 1
group by ranked.Repnbr;
However, I would be inclined to use two levels of aggregation:
-- Total commission per rep, counting each tax id exactly once at its highest QtrLYComm.
select q.Repnbr, sum(q.QtrLYComm) as qtrlycommrep
from (-- First level: collapse every (repnbr, taxid) pair to a single row.
      -- MAX (rather than DISTINCT) guarantees one row per tax id even when the
      -- same tax id appears with several different commission values.
      select repnbr, taxid, max(QtrLYComm) as QtrLYComm
      from #qtrly
      group by repnbr, taxid
     ) q
-- Second level: add up the per-tax-id amounts for each rep.
group by q.Repnbr;

Replacement for row_number() in clickhouse

row_number() is not supported by the ClickHouse database, so I am looking for an alternative function.
SELECT company_name AS company,
DOMAIN,
city_name AS city,
state_province_code AS state,
country_code AS country,
location_revenue AS revenueRange,
location_TI_industry AS industry,
location_employeecount_range AS employeeSize,
topic,
location_duns AS duns,
rank AS intensityRank,
dnb_status_code AS locationStatus,
rank_delta AS intensityRankDelta,
company_id,
ROW_NUMBER() OVER (PARTITION BY DOMAIN) AS rowNumberFROM company_intent c
WHERE c.rank > 0
AND c.rank <= 10
AND c.signal_count > 0
AND c.topic IN ('Cloud Computing')
AND c.country_code = 'US'
AND c.rank IN (7, 8, 9, 10)
GROUP BY c.location_duns,
company_name,
DOMAIN,
city_name,
state_province_code,
country_code,
location_revenue,
location_TI_industry,
location_employeecount_range,
topic,
rank,
dnb_status_code,
rank_delta,
company_id
ORDER BY intensityRank DESC
LIMIT 15 SELECT COUNT (DISTINCT c.company_id) AS COUNT
FROM company_intent c
WHERE c.rank > 0
AND c.rank <= 10
AND c.signal_count > 0
AND c.topic IN ('Cloud Computing')
AND c.country_code = 'US'
AND c.rank IN (7, 8, 9, 10)
When executed the above query got the below error.
Expected one of: SETTINGS, FORMAT, WITH, HAVING, LIMIT, FROM, PREWHERE, token, UNION ALL, Comma, WHERE, ORDER BY, INTO OUTFILE, GROUP BY
Any suggestions are appreciated.
Solution #1
SELECT
*,
rowNumberInAllBlocks()
FROM
(
-- YOUR SELECT HERE
)
https://clickhouse.com/docs/en/sql-reference/functions/other-functions/#rownumberinallblocks says:
rowNumberInAllBlocks() Returns the ordinal number of the row in the data block. This function only considers the affected data blocks.
Solution #2
SELECT
row_number() OVER (),
...
FROM
...
https://clickhouse.com/docs/en/sql-reference/window-functions/
In my tests, both solutions show identical results. However, you need to remember that at the beginning of 2022, window functions work in single-threaded mode.
ClickHouse doesn't support Window Functions for now. There is a rowNumberInAllBlocks function that might be interesting to you.
SELECT *, rowNumberInAllBlocks() as row_count FROM (SELECT .....)
Something like this (it looks terrible but works well):
SELECT *, rn +1 -min_rn current, max_rn - min_rn + 1 last FROM (
SELECT *, rowNumberInAllBlocks() rn FROM (
SELECT i_device, i_time
FROM tbl
ORDER BY i_device, i_time
) t
) t1 LEFT JOIN (
SELECT i_device, min(rn) min_rn, max(rn) max_rn FROM (
SELECT *, rowNumberInAllBlocks() rn FROM (
SELECT i_device, i_time
FROM tbl
ORDER BY i_device, i_time
) t
) t GROUP BY i_device
) t2 USING (i_device)

Select only 20 rows of every distinct name

I have a table with more than 1000 rows that has a column "AnaId". Values in this column are repeated many times; for example, 003912 is repeated 85 times and 003156 is repeated 70 times. I want to select a maximum of 20 rows for every distinct AnaId. I have no idea how to do it.
SELECT dbo.Analysis.AnaId, Analysis.CasNo, MoleculeId,
SUM(dbo.AnalysisSummary.Area) as TotalArea
FROM dbo.Analysis LEFT JOIN dbo.AnalysisSummary
ON dbo.AnalysisSummary.AnaId = dbo.Analysis.AnaId
WHERE dbo.Analysis.Sample like '%Oil%'
GROUP BY dbo.Analysis.AnaId,Analysis.CasNo, MoleculeId ORDER BY
TotalArea DESC
You can use row_number():
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
With the edits to your question, you can do:
with t as (
<your query here without order by>
)
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
If you have another table of names, you can also use cross apply:
select t.*
from names n cross apply
(select top 20 t.*
from t
where t.name = n.name
) t;
Using Rank()
-- Intended to keep at most 20 rows per name using rank().
-- NOTE: the window orders by the same column it partitions by, so all rows in a
-- partition tie and rank() returns 1 for each of them; the filter below therefore
-- keeps every row. Order by a column that distinguishes rows within a name to
-- actually limit the result.
select t.*
from (select t.*, rank() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
Using Dense_Rank()
-- Intended to keep at most 20 rows per name using dense_rank().
-- NOTE: the window orders by the same column it partitions by, so all rows in a
-- partition tie and dense_rank() returns 1 for each of them; the filter below
-- therefore keeps every row. Order by a column that distinguishes rows within a
-- name to actually limit the result.
select t.*
from (select t.*, Dense_Rank() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
Using Row_Number
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
This will help you understand the usage of each of these functions.
Base Code Credits:-#gordon

how to query the occuring time of the max wind from a database?

I want to find the time at which the maximum wind occurred, the maximum wind itself, and the total rain from a database. The table has three columns: observerTime, wind and rain. How do I write the SQL statement to get this result?
select observerTime from t where wind = (select max(wind) from t)
or, if you need the last date on which it occurs:
select max(observerTime) from t where wind = (select max(wind) from t)
You don't mention the database, but one of the following is likely to work:
-- Row with the highest wind, plus the total rain over the whole table (TOP syntax).
-- The FROM clause is required: without it, * and wind have no source table.
select top 1 *, (select sum(rain) from t) as TotalRain
from t
order by wind desc
or:
select *, (select sum(rain) from t) as TotalRain
from t
order by wind desc
limit 1
or
-- Row with the highest wind, plus the total rain over the whole table (rownum syntax).
-- The star is qualified as t.* because a bare * cannot be combined with other
-- select-list items in Oracle, the dialect that provides rownum.
select t.*, (select sum(rain) from t) as TotalRain
from (-- sort first so that rownum = 1 picks the windiest row
      select *
      from t
      order by wind desc
     ) t
where rownum = 1
You should be able to use something like this:
select t1.observerTime,
t1.wind,
(select sum(rain) from yourtable) TotalRain
from yourtable t1
inner join
(
select max(wind) MaxWind
from yourtable
) t2
on t1.wind = t2.maxwind
See SQL Fiddle with Demo
Since you are using SQL Server, you can also use row_number():
select observertime,
wind,
(select sum(rain) from yourtable) TotalRain
from
(
select observertime,
wind,
rain,
row_number() over(order by wind desc) rn
from yourtable
) src
where rn = 1
See SQL Fiddle with Demo