Select Random Values for Grouped Dataset - sql

I'm no whizz at SQL. However I'm using the following query:
select count(*) as countis, avclassfamily
from malwarehashesandstrings
where behaviouralbinary IS true and
avclassfamily != 'SINGLETON'
group by avclassfamily
ORDER BY countis desc
LIMIT 50;
I would like to select 3 random hashes from the malwarehashsha256 column grouped by the avclassfamily column.
The following query works, question over:
select count(*) as countis,avclassfamily from malwarehashesandstrings where behaviouralbinary IS true and avclassfamily != 'SINGLETON' group by avclassfamily ORDER BY countis desc LIMIT 50;
virustotal=# select m.avclassfamily, m.cnt,
array_agg(malwarehashsha256)
from (select malwarehashesandstrings.*,
count(*) over (partition by avclassfamily) as cnt,
row_number() over (partition by avclassfamily order by random()) as seqnum
from malwarehashesandstrings
where behaviouralbinary and
avclassfamily <> 'SINGLETON'
) as m
where seqnum <= 3
group by m.avclassfamily, m.cnt ORDER BY m.cnt DESC LIMIT 50;

If I understand correctly, you can use row_number():
-- Returns at most three randomly chosen rows for each avclassfamily,
-- considering only rows where behaviouralbinary is true and the family
-- is not 'SINGLETON'. The output keeps every source column plus seqnum.
WITH shuffled AS (
    SELECT src.*,
           -- random() ordering gives each row an arbitrary position within its family
           ROW_NUMBER() OVER (PARTITION BY src.avclassfamily ORDER BY random()) AS seqnum
    FROM malwarehashesandstrings AS src
    WHERE src.avclassfamily <> 'SINGLETON'
      AND src.behaviouralbinary
)
SELECT shuffled.*
FROM shuffled
WHERE shuffled.seqnum <= 3;
If you want this in a column in your existing query, one method is:
-- Per avclassfamily: the family's total matching row count (cnt) and an array
-- of up to three randomly picked malwarehashsha256 values.
select m.avclassfamily, m.cnt,
       array_agg(m.malwarehashsha256)
from (select m.*,
             -- cnt is a window count over the whole family, evaluated before the
             -- seqnum filter, so it is the full family size, not the sample size
             count(*) over (partition by m.avclassfamily) as cnt,
             row_number() over (partition by m.avclassfamily order by random()) as seqnum
      from malwarehashesandstrings m
      where m.behaviouralbinary and
            m.avclassfamily <> 'SINGLETON'
     ) m
where seqnum <= 3
group by m.avclassfamily, m.cnt;

Related

How to get the sum of all values?

-- Keeps, per cod_mpio, the row with the highest anio among the selected crops,
-- and only when that anio equals the highest anio found among the per-cod_mpio
-- latest rows of "ARROZ SECANO MANUAL".
select *
from (
  select * except(rn)
  from (
    select sistema_productivo, cod_mpio, periodo,
           has_sembrada, has_sembrada_und, id_cultivo, anio,
           row_number() over (partition by cod_mpio order by anio desc) rn
    from TABLENAME
    where SUBSTR(cod_mpio,1,2) in ("23")
      and sistema_productivo in ("ARROZ SECANO MANUAL", "MAIZ TECNIFICADO")
      and periodo in ("B")
  ) t
  where rn = 1
    and anio = (
      -- scalar subquery: the reference year to compare against
      select max(anio) as date_max
      from (
        select * except(rn)
        from (
          select sistema_productivo, cod_mpio, periodo,
                 has_sembrada, has_sembrada_und, id_cultivo, anio,
                 row_number() over (partition by cod_mpio order by anio desc) rn
          from TABLENAME
          where SUBSTR(cod_mpio,1,2) in ("23")
            and sistema_productivo in ("ARROZ SECANO MANUAL")
            and periodo in ("B")
        ) t
        where rn = 1
      )
    )
)
This query returned
How can I get the total sum of has_sembrada for "MAIZ TECNIFICADO" and "ARROZ SECANO MANUAL"?
select sistema_productivo, sum(has_sembrada) as total from
(....)
group by sistema_productivo

SQL select row with max value or distinct value and sum all

I have the following data that is returned to me. I need to get a distinct or max sum of all the commission by taxid for a single repnbr. The 'qtrlycommrep' column is the value I'm trying to get to, but not able to. For repnbr c590, I need to get the 854.66 commission amount, which is the max for each taxid.
What am I doing wrong?
Any help would be much appreciated!
Here's what I've tried so far. Using the Row_number
select distinct
sub.Repnbr
, (sub.QtrLYComm) as qtrlycommrep
from (
select distinct repnbr, QtrLYComm
, rn = row_number() over(partition by repnbr order by QtrLYComm desc)
from #qtrly
) sub
where sub.rn = 1
Cross Apply
select distinct
#qtrly.repnbr
, x.QtrLYComm as qtrlycommrep
from #qtrly
cross apply (
select top 1
*
from #qtrly as i
where i.repnbr = Repnbr
order by i.qtrlycomm desc
) as x;
inner join
select
#qtrly.repnbr, #qtrly.qtrlycomm as qtrlycommrep
from #qtrly
inner join (
select maxvalue = max(qtrlycomm), repnbr
from #qtrly
group by repnbr
) as m
on #qtrly.repnbr = m.repnbr
and #qtrly.qtrlycomm = m.maxvalue;
order by row_number
select top 1 with ties
#qtrly.repnbr, #qtrly.qtrlycomm as qtrlycommrep
from #qtrly
order by
row_number() over(partition by repnbr
order by qtrlycomm desc)
You want one value per tax id. You need to include that. For instance:
select q.Repnbr, sum(q.QtrLYComm) as qtrlycommrep
from (select q.*,
row_number() over(partition by repnbr, taxid order by QtrLYComm desc) as seqnum
from #qtrly q
) q
where seqnum = 1
group by q.Repnbr;
However, I would be inclined to use two levels of aggregation:
-- Two levels of aggregation:
--   inner: one row per (repnbr, taxid) holding that taxid's highest QtrLYComm
--   outer: sum of those per-taxid maxima for each repnbr
-- Using max() rather than DISTINCT avoids double-counting a taxid that has
-- more than one distinct commission value.
select q.Repnbr, sum(q.QtrLYComm) as qtrlycommrep
from (select repnbr, taxid, max(QtrLYComm) as QtrLYComm
      from #qtrly
      group by repnbr, taxid
     ) q
group by q.Repnbr;

How to find most frequent value in SQL column and return that value?

I was trying to do something like this:
select nume_produs
from incasari
group by id
having count(nume_produs) = max(count(nume_produs));
but it doesn't work
Do a GROUP BY. Order by count descending. Fetch the first row (highest count) only.
select nume_produs, count(*) as cnt
from incasari
group by nume_produs
order by cnt desc
fetch first 1 row with ties
For the most common value in the column:
-- Returns the single most frequent nume_produs in incasari
-- (an arbitrary one of them if several share the highest count).
select i.nume_produs
from (select nume_produs, count(*) as cnt,
             -- descending, so the highest count receives seqnum = 1
             row_number() over (order by count(*) desc) as seqnum
      from incasari
      group by nume_produs
     ) i
where i.seqnum = 1;
If you want multiple values in the event of duplicates, use rank() instead of row_number().
If you want the most common value per id, then add partition by:
-- Returns, for every id, its most frequent nume_produs
-- (an arbitrary one of them if several share the highest count for that id).
-- id is included in the output so each result row can be matched to its id.
select i.id, i.nume_produs
from (select id, nume_produs, count(*) as cnt,
             -- counts are per (id, nume_produs); descending so the highest
             -- count within each id receives seqnum = 1
             row_number() over (partition by id order by count(*) desc) as seqnum
      from incasari
      -- id must be grouped as well, otherwise it cannot be used in the window
      group by id, nume_produs
     ) i
where i.seqnum = 1;
SELECT `nume_produs`,
COUNT(`nume_produs`) AS `value_occurrence`
FROM `incasari`
GROUP BY `nume_produs`
ORDER BY `value_occurrence` DESC
LIMIT 1;
Increase the LIMIT value from 1 to N if you want to see the N most common values of the column.

Issue with Hive Rank queries

select season,violation_code, cnt,
RANK() over (Partition BY season order by cnt desc) AS rank
from
( select season,violation_code,
count(*) as cnt
from ParkingViolations_seondary
group by season,violation_code
) tmp
where rank <= 3
I'm new to Hive. Can somebody help me figure out what is wrong with the above query?
It throws the following error:
Error while compiling statement:
FAILED: SemanticException [Error 10004]: line 4:6 Invalid table alias
or column reference 'rank': (possible column names are: season,
violation_code, cnt)
Any quick help would be appreciated.
Use subquery to be able to address rank in the where clause:
-- Violation codes ranked 1 to 3 within each season by row count.
-- RANK() gives tied counts the same rank, so a season may return more than
-- three rows.
SELECT ranked.season,
       ranked.violation_code,
       ranked.cnt,
       ranked.rnk
FROM (
    -- The window result is aliased here so the outer WHERE can filter on it.
    SELECT counted.season,
           counted.violation_code,
           counted.cnt,
           RANK() OVER (PARTITION BY counted.season ORDER BY counted.cnt DESC) AS rnk
    FROM (
        -- One row per (season, violation_code) with its number of rows.
        SELECT season,
               violation_code,
               COUNT(*) AS cnt
        FROM ParkingViolations_seondary
        GROUP BY season, violation_code
    ) counted
) ranked
WHERE ranked.rnk <= 3
Yes i was also able to get it working with the following:
SELECT * FROM
(
SELECT season,violation_code, cnt, RANK() over (Partition BY season ORDER BY cnt DESC) AS frequency
FROM
(SELECT season,violation_code, COUNT(*) as cnt FROM ParkingViolations_seondary
WHERE (violation_code <> 0) and (street_code1 <> 0 or street_code2 <> 0 or street_code3 <> 0)
GROUP BY season,violation_code)TMP
)TMP1
WHERE frequency <= 3;

Select only 20 rows of every distinct name

I have a table with over 1000 rows that has a column "AnaId". The values of this column are repeated many times; for example, 003912 is repeated 85 times and 003156 is repeated 70 times. I want to select a maximum of 20 rows for every distinct AnaId. I have no idea how to do it.
SELECT dbo.Analysis.AnaId, Analysis.CasNo, MoleculeId,
SUM(dbo.AnalysisSummary.Area) as TotalArea
FROM dbo.Analysis LEFT JOIN dbo.AnalysisSummary
ON dbo.AnalysisSummary.AnaId = dbo.Analysis.AnaId
WHERE dbo.Analysis.Sample like '%Oil%'
GROUP BY dbo.Analysis.AnaId,Analysis.CasNo, MoleculeId ORDER BY
TotalArea DESC
You can use row_number():
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
With the edits to your question, you can do:
with t as (
<your query here without order by>
)
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
If you have another table of names, you can also use cross apply:
select t.*
from names n cross apply
(select top 20 t.*
from t
where t.name = n.name
) t;
Using Rank()
-- rank() variant of the per-name row limit.
-- NOTE: the window ORDER BY uses the same column as PARTITION BY, so all rows
-- of a partition tie and rank() yields 1 for every one of them; the
-- seqnum <= 20 filter therefore keeps every row. Order by a column that
-- varies within a name for the filter to have any effect.
select t.*
from (select t.*, rank() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
Using Dense_Rank()
-- Dense_Rank() variant of the per-name row limit.
-- NOTE: the window ORDER BY uses the same column as PARTITION BY, so all rows
-- of a partition tie and Dense_Rank() yields 1 for every one of them; the
-- seqnum <= 20 filter therefore keeps every row. Order by a column that
-- varies within a name for the filter to have any effect.
select t.*
from (select t.*, Dense_Rank() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
Using Row_Number
select t.*
from (select t.*, row_number() over (partition by name order by name) as seqnum
from t
) t
where seqnum <= 20;
This will help you understand the usage of each of these special functions.
Base Code Credits:-#gordon