stratified sample on ranges - sql

I have table_1, that has data such as:
Range Start Range End Frequency
10 20 90
20 30 68
30 40 314
40 40 191 (here, it means we have just 40 as data point repeating 191 times)
table_2:
group value
10 56.1
10 88.3
20 53
20 20
30 55
I need to get the stratified sample on the basis of range from table_1, the table_2 can have millions of rows but the result should be restricted to just 10k points.
Tried below query:
SELECT
d.*
FROM
(
SELECT
ROW_NUMBER() OVER(
PARTITION BY group
ORDER BY group
) AS seqnum,
COUNT(*) OVER() AS ct,
COUNT(*) OVER(PARTITION BY group) AS cpt,
group, value
FROM
table_2 d
) d
WHERE
seqnum < 10000 * ( cpt * 1.0 / ct )
but a bit confused with the analytics functions usage here.
Expecting 10k records as a stratified sample from table_2:
Result table:
group value
10 56.1
20 53
20 20
30 55

It means you need atleast one record of each group and more records on random basis then try this:
SELECT GROUP, VALUE FROM
(SELECT T2.GROUP, T2.VALUE,
ROW_NUMBER()
OVER (PARTITION BY T2.GROUP ORDER BY NULL) AS RN
FROM TABLE_1 T1
JOIN TABLE_2 T2
ON(T1.RANGE = T2.GROUP))
WHERE RN = 1 OR
CASE WHEN RN > 1
AND RN = CEIL(DBMS_RANDOM.VALUE(1,RN))
THEN 1 END = 1
FETCH FIRST 10000 ROWS ONLY;
Here, Rownum is taken on random basis for each group and then result is taking rownum 1 and other rownum if they fulfill random condition.
Cheers!!

If I understand what you want - which is by no means certain - then I think you want to get a maximum of 10000 rows, with the number of group values proportional to the frequencies. So you can get the number of rows you want from each range with:
select range_start, range_end, frequency,
frequency/sum(frequency) over () as proportion,
floor(10000 * frequency/sum(frequency) over ()) as limit
from table_1;
RANGE_START RANGE_END FREQUENCY PROPORTION LIMIT
----------- ---------- ---------- ---------- ----------
10 20 90 .135746606 1357
20 30 68 .102564103 1025
30 40 314 .473604827 4736
40 40 191 .288084465 2880
Those limits don't quite add up to 10000; you could go slightly above with ceil instead of floor.
You can then assign a nominal row number to each entry in table_2 based on which range it is in, and then restrict the number of rows from that range via that limit:
with cte1 (range_start, range_end, limit) as (
select range_start, range_end, floor(10000 * frequency/sum(frequency) over ())
from table_1
),
cte2 (grp, value, limit, rn) as (
select t2.grp, t2.value, cte1.limit,
row_number() over (partition by cte1.range_start order by t2.value) as rn
from cte1
join table_2 t2
on (cte1.range_end > cte1.range_start and t2.grp >= cte1.range_start and t2.grp < cte1.range_end)
or (cte1.range_end = cte1.range_start and t2.grp = cte1.range_start)
)
select grp, value
from cte2
where rn <= limit;
...
9998 rows selected.
I've used order by t2.value in the row_number() call because it isn't clear how you want to pick which rows in the range you actually want; you might want to order by dbms_random.value or something else.
db<>fiddle with some artificial data.

Related

Aggregating rows in SQL with missing Booleans

I have the below SQL script which returns the following data from a PostgreSQL DB view table.
SELECT
"V_data".macaddr,
"V_data".sensorid,
"V_data".ts,
"V_data".velocity,
"V_data".temp,
"V_data".highspeed,
"V_data".hightemp,
"V_data".distance,
FROM
sensordb."V_data"
WHERE
"V_data".macaddr like '%abcdef'
AND
(
("V_data".sensorid = 'abc1') or ("V_data".sensorid = 'a2bc') or ("V_data".sensorid = 'ab3c')
)
AND
"V_data".ts >= 1616370867000
ORDER BY
"V_data".ts DESC;
Output
macaddr
sensorid
ts
velocity
temp
highspeed
hightemp
distance
abcdef
abc1
1616370867010
25
32
52
abcdef
a2bc
1616370867008
27
35
T
51
abcdef
ab3c
1616370867006
26
30
50
abcdef
abc1
1616370867005
24
36
T
50
abcdef
a2bc
1616370867004
27
31
50
abcdef
abc1
1616370867002
21
30
T
48
abcdef
ab3c
1616370867000
22
33
F
46
I want to aggregate the rows such that I have the latest readings per sensorid for ts, velocity, temp, distance.
For the Booleans highspeed and hightemp, I want the latest available Boolean value or an empty cell if no Boolean value was available.
Expected output
macaddr
sensorid
ts
velocity
temp
highspeed
hightemp
distance
abcdef
abc1
1616370867010
25
32
T
T
52
abcdef
a2bc
1616370867008
27
35
T
51
abcdef
ab3c
1616370867006
26
30
F
50
How could I simplify this task?
Thanks.
You can use DISTINCT ON (available only in PostgreSQL afaik) to simplify this query. You can do:
with
q as (
-- your query here
)
select
l.macaddr, l.sensorid, l.ts, l.velocity, l.temp,
s.highspeed, t.hightemp,
l.distance
from (
select distinct on (sensorid) *
from q
order by sensorid, ts desc
) l
left join (
select distinct on (sensorid) *
from q
where highspeed is not null
order by sensorid, ts desc
) s on s.sensorid = l.sensorid
left join (
select distinct on (sensorid) *
from q
where hightemp is not null
order by sensorid, ts desc
) t on t.sensorid = l.sensorid
Hmmm . . . For all but the boolean columns DISTINCT ON would work. But those booleans are tricky. You could use some tricks on booleans.
Instead, let's go for ROW_NUMBER() to get the most recent row. And fiddle with arrays to get the most recent boolean values:
SELECT d.macaddr, d.sensorid,
MAX(d.ts) as ts,
MAX(d.velocity) FILTER (WHERE seqnum = 1) as velocity,
MAX(d.temp) FILTER (WHERE seqnum = 1) as temp,
(ARRAY_REMOVE(ARRAY_AGG(d.highspeed ORDER BY ts DESC), NULL))[1] as highspeed,
(ARRAY_REMOVE(ARRAY_AGG(d.hightemp ORDER BY ts DESC), NULL))[1] as hightemp
MAX(d.distance) FILTER (WHERE seqnum = 1)
FROM (SELECT d.*,
ROW_NUMBER() OVER (PARTITION BY d.macaddr, d.sensorid ORDER BY ts DESC) as seqnum
FROM sensordb."V_data" d
WHERE d.macaddr like '%abcdef' AND
d.sensorid IN ('abc1', 'a2bc', 'ab3c') AND
d.ts >= 1616370867000
) d
GROUP BY d.macaddr, d.sensorid
ORDER BY d.ts DESC;

Limit top pair of columns

The data I am working with looks like below-
category_id subcategory_id date quantities
123 45 2020-02-01 500
123 45 2020-02-13 400
456 35 2020-05-09 350
456 35 2020-05-15 250
456 35 2020-06-18 200
.
.
.
n such columns
Quantities are sorted in descending order
I want to get the data (as seen above) for the first (top) 10 unique pairs of (category_id, subcategory_id). Just like we use limit 10 to get the first 10 records, I want to limit by the top 10 unique pairs of (category_id, subcategory_id) and get the all the data as seen above.
Below is for BigQuery Standard SQL
#standardSQL
SELECT * EXCEPT(rn) FROM (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY category_id, subcategory_id ORDER BY quantities DESC) rn
FROM `project.dataset.table`
)
WHERE rn <= 10
Another - more BigQuery'ish alternative is below
#standardSQL
SELECT TopN.* FROM (
SELECT ARRAY_AGG(t ORDER BY quantities DESC LIMIT 10) topN
FROM `project.dataset.table` t
GROUP BY category_id, subcategory_id
) t, t.topN
If you want 10 rows, each with different category_id/subcategory_id pairs, then you can use:
select t.* except (seqnum)
from (select t.*,
row_number() over (partition by category_id, subcategory_id order by quantities desc) as seqnum
from t
) t
where seqnum = 1
order by quantities desc
limit 10;
This gets the first row (by quantities) for each id pair and then limits to the 10 largest values.

How to use this in sql -- > max(sum (paid * quantity )) to solve a query

How to get the max value order of each customer ?
select num, max(sum(paid*quantity))
from orders join
pizza
using (order#)
group by customer#;
table
num orderN price
-------- --- -------
1 109 30
1 118 25
3 101 30
3 115 27
4 107 23
5 100 17
5 129 16
output req-
num Pnum price
-------- --- -------
1 109 30
3 101 30
4 107 23
5 100 17
You want to select the record having the highest price in each group of nums.
If your RDBMS supports window functions, that's straight forward with ROW_NUMBER() :
SELECT num, pnum, price
FROM (
SELECT t.*, ROW_NUMBER OVER(PARTITION BY num ORDER BY price DESC) rn
FROM mytable t
) x
WHERE rn = 1
Else, you can take the following approach, that uses a NOT EXISTS condition with a correlated subquery to ensure that the record being joined in the one with the highest price for the current num :
SELECT num, pnum, price
FROM mytable t
WHERE NOT EXISTS (
SELECT 1 FROM mytable t1 WHERE t1.num = t.num AND t1.price > t.price
)

How can I select top 3 for each group based on another column in sqlite?

I'm trying to get top 3 most profitable UserIDs in each country in one table using sqlite. I'm not sure where to use LIMIT 3.
Here is the table I have:
Country | UserID | Profit
US 1 100
US 12 98
US 13 10
US 5 8
US 2 5
IR 9 95
IR 3 90
IR 8 70
IR 4 56
IR 15 40
the result should look like this:
Country | UserID | Profit
US 1 100
US 12 98
US 13 10
IR 9 95
IR 3 90
IR 8 70
One pretty simple method is:
select t.*
from t
where t.profit >= (select t2.profit
from t t2
where t2.country = t.country
order by t2.profit desc
limit 1 offset 2
);
This assumes at least three records for each country. You can get around that with coalesce():
select t.*
from t
where t.profit >= coalesce((select t2.profit
from t t2
where t2.country = t.country
order by t2.profit desc
limit 1 offset 2
), t.profit
);
Since SQLite doesn't support windows function, so you can write a subquery be a seqnum by Country, then get top 3
You can try this query.
select t.Country,t.UserID,t.Profit
from(
select t.*,
(select count(*)
from T t2
where t2.Country = t.Country and t2.Profit >= t.Profit
) as seqnum
from T t
)t
where t.seqnum <=3
sqlfiddle:https://www.db-fiddle.com/f/tmNhRLGG2oKqCKXJEDsjfe/0
LIMIT won't be usefull as it applies to a whole result set.
I would create an auxiliary column "CountryRank" like this:
SELECT *, (SELECT COUNT() FROM Data AS d WHERE d.Country=Data.Country AND d.Profit>Data.Country)+1 AS CountryRank
FROM Data;
And query on that result:
SELECT Country, UserID, Profit
FROM (
SELECT *, (SELECT COUNT() FROM Data AS d WHERE d.Country=Data.Country AND d.Profit>Data.Profit)+1 AS CountryRank FROM Data)
WHERE CountryRank<=3
ORDER BY Country, CountryRank;

Accumulate a summarized column

I could need some help with a SQL statement. So I have the table "cont" which looks like that:
cont_id name weight
----------- ---------- -----------
1 1 10
2 1 20
3 2 40
4 2 15
5 2 20
6 3 15
7 3 40
8 4 60
9 5 10
10 6 5
I then summed up the weight column and grouped it by the name:
name wsum
---------- -----------
2 75
4 60
3 55
1 30
5 10
6 5
And the result should have a accumulated column and should look like that:
name wsum acc_wsum
---------- ----------- ------------
2 75 75
4 60 135
3 55 190
1 30 220
5 10 230
6 5 235
But I didn't manage to get the last statement working..
edit: this Statement did it (thanks Gordon)
select t.*,
(select sum(wsum) from (select name, SUM(weight) wsum
from cont
group by name)
t2 where t2.wsum > t.wsum or (t2.wsum = t.wsum and t2.name <= t.name)) as acc_wsum
from (select name, SUM(weight) wsum
from cont
group by name) t
order by wsum desc
So, the best way to do this is using cumulative sum:
select t.*,
sum(wsum) over (order by wsum desc) as acc_wsum
from (<your summarized query>) t
The order by clause makes this cumulative.
If you don't have that capability (in SQL Server 2012 and Oracle), a correlated subquery is an easy way to do it, assuming the summed weights are distinct values:
select t.*,
(select sum(wsum) from (<your summarized query>) t2 where t2.wsum >= t.wsum) as acc_wsum
from (<your summarized query>) t
This should work in all dialects of SQL. To work with situations where the accumulated weights might have duplicates:
select t.*,
(select sum(wsum) from (<your summarized query>) t2 where t2.wsum > t.wsum or (t2.wsum = t.wsum and t2.name <= t.name) as acc_wsum
from (<your summarized query>) t
try this
;WITH CTE
AS
(
SELECT *,
ROW_NUMBER() OVER(ORDER BY wsum) rownum
FROM #table1
)
SELECT
c1.name,
c1.wsum,
acc_wsum= (SELECT SUM(c2.wsum)
FROM cte c2
WHERE c2.rownum <= c1.rownum)
FROM CTE c1;
or you can join instead of using subquery
;WITH CTE
AS
(
SELECT *,
ROW_NUMBER() OVER(ORDER BY usercount) rownum
FROM #table1
)
SELECT
c1.name,
c1.wsum,
acc_wsum= SUM(c2.wsum)
FROM CTE c1
INNER JOIN CTE c2 ON c2.rownum <= c1.rownum
GROUP BY c1.name, c1.wsum;