Find the mode in BigQuery - google-bigquery

Find the mode in BigQuery - google-bigquery

The mode is the value that appears most often in a set.
I would like something like:
-- Desired (hypothetical) query: MODE() stands for the aggregate the asker
-- wishes existed, returning the most frequent t.value within each t_id group.
-- value_list is only there to illustrate the group's contents.
SELECT
t.id as t_id,
GROUP_CONCAT(t.value) as value_list,
MODE(t.value) AS value_mode
FROM dataset.table as t
GROUP BY t_id
such that, for example:
t_id value_list value_mode
1 2,2,2,3,6,6 2
How is that done?
EDIT: The value_list is just there for illustration purpose. Only need the mode

-- Mode per id: split each comma-separated value string into elements, count
-- how often each element occurs, and keep the most frequent one per id.
-- NOTE(review): referencing the alias c inside over() and grouping on the
-- result of split() look like BigQuery legacy SQL - confirm the dialect.
select id, value as value_list, v as value_mode
from (
select
id, value, v,
-- occurrences of element v within one (id, value) pair
count(1) as c,
-- rank elements of an id by frequency; ties are broken arbitrarily
row_number() over(partition by id order by c desc) as top
from (
-- v holds the individual elements of the comma-separated string
select id, value, split(value) as v
from dataset.table
)
group by id, value, v
)
-- top = 1 is the most frequent element
where top = 1

I often have to find the mode of prices for respective groups (e.g. length and amps) to filter out sale prices and the like.
I typically use one of two methods; both aggregate the values into an array and un-nest it in order of frequency. One takes the top row with LIMIT; the other indexes the ordered array with [OFFSET(0)], which is handy when you want the Nth value.
Both are included below:
-- Sample data: prices per (length, amps) group; every group repeats one price
-- so that it has a clear most frequent and least frequent value.
WITH t AS (
  SELECT 18 AS length, 'HIGH' AS amps, 99.95 AS price UNION ALL
  SELECT 18, 'HIGH', 99.95 UNION ALL
  SELECT 18, 'HIGH', 5.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 4.5 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 9.99 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 5.65
),
-- One row per (length, amps) carrying all prices of the group as an array.
grouped AS (
  SELECT
    length,
    amps,
    ARRAY_AGG(price) AS price_array
  FROM t
  GROUP BY length, amps
)
SELECT
  length,
  amps,
  -- By LIMIT: rank the distinct prices by frequency and keep the first one.
  -- The secondary sort key x makes the pick deterministic when several
  -- prices occur equally often (the lowest such price wins).
  (SELECT x FROM UNNEST(price_array) AS x
   GROUP BY x ORDER BY COUNT(*) DESC, x LIMIT 1) AS most_freq_price,
  (SELECT x FROM UNNEST(price_array) AS x
   GROUP BY x ORDER BY COUNT(*) ASC, x LIMIT 1) AS least_freq_price,
  -- By OFFSET: materialize the whole ranking as an array and index into it;
  -- change the offset to fetch the Nth most/least frequent price.
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
        GROUP BY x ORDER BY COUNT(*) DESC, x)[OFFSET(0)] AS most_freq_price_offset,
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
        GROUP BY x ORDER BY COUNT(*) ASC, x)[OFFSET(0)] AS least_freq_price_offset
FROM grouped

For your example, this is how I would solve it:
-- Mode of the elements of one comma-separated literal.
-- NOTE(review): FIRST() and the use of alias c inside OVER() look like
-- BigQuery legacy SQL - confirm the dialect before running.
SELECT x, w mode
FROM (
-- c = occurrences of element w; rn ranks elements by descending frequency;
-- FIRST(x) carries the original string along for display
SELECT COUNT(*) c, w, ROW_NUMBER() OVER(ORDER BY c DESC) rn, FIRST(x) x
FROM (
-- w holds the individual elements of the literal
SELECT SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)
)
-- position 2 in the select list is w
GROUP BY 2
)
-- keep only the most frequent element
WHERE rn=1
And with the GROUP_CONCAT within query:
-- Same mode computation, but the displayed list is rebuilt with GROUP_CONCAT
-- from the split elements instead of reusing the original string.
-- NOTE(review): FLATTEN(), FIRST() and GROUP_CONCAT() OVER() look like
-- BigQuery legacy SQL - confirm the dialect before running.
SELECT gc, w mode
FROM (
-- c = occurrences of element w; rn ranks elements by descending frequency
SELECT COUNT(*) c, w, ROW_NUMBER() OVER(ORDER BY c DESC) rn, FIRST(gc) gc
FROM (
-- gc = all elements concatenated over the whole input (empty OVER())
SELECT GROUP_CONCAT(w) OVER() gc, w
FROM (FLATTEN((
SELECT SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)), w)
)
)
-- position 2 in the select list is w
GROUP BY 2
)
-- keep only the most frequent element
WHERE rn=1
And handling partitions:
-- Mode per tid: same approach as the single-list version, with every window
-- and the grouping partitioned by tid.
-- NOTE(review): FLATTEN(), FIRST() and GROUP_CONCAT() OVER() look like
-- BigQuery legacy SQL - confirm the dialect before running.
SELECT tid, gc value_list, w value_mode
FROM (
-- c = occurrences of w within tid; rn ranks elements of a tid by frequency
SELECT tid, COUNT(*) c, w, ROW_NUMBER() OVER(PARTITION BY tid ORDER BY c DESC) rn, FIRST(gc) gc
FROM (
-- gc = all elements of one tid concatenated
SELECT tid, GROUP_CONCAT(w) OVER(PARTITION BY tid) gc, w
FROM (FLATTEN((
-- sample input: a single tid with one comma-separated list
SELECT 1 tid, SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)), w)
)
)
GROUP BY tid, w
)
-- keep only the most frequent element of each tid
WHERE rn=1

There is now a built-in function for this:
approx_top_count()
Note that, as its name says, the result is approximate. Here is an example of its usage:
https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_top_count

Related

Get oldest modified price

I have a table where each rows contains product id (A), price (P) and modification date (D) in YYYYMMDD format.
Here is the table :
-- Sample data: price history of one product
-- (a = product id, p = price, d = modification date as a YYYYMMDD number).
WITH temp_table AS (
    SELECT 744583 AS a, 9.21 AS p, 20210706 AS d FROM sysibm.sysdummy1
    UNION ALL
    SELECT 744583 AS a, 9.21 AS p, 20210630 AS d FROM sysibm.sysdummy1
    UNION ALL
    SELECT 744583 AS a, 9.21 AS p, 20210628 AS d FROM sysibm.sysdummy1
    UNION ALL
    SELECT 744583 AS a, 9.04 AS p, 20210604 AS d FROM sysibm.sysdummy1
    UNION ALL
    SELECT 744583 AS a, 9.04 AS p, 20210201 AS d FROM sysibm.sysdummy1
    UNION ALL
    SELECT 744583 AS a, 9.21 AS p, 20200407 AS d FROM sysibm.sysdummy1
)
SELECT
    history.a,
    history.p,
    history.d
FROM temp_table AS history
what i have
What I would like to find is when the price changed for the last time. In this example, that is the third row:
enter image description here
How would you do that ?
Thanks,

One method uses lag() and then ordering:
-- Return the row at which the price last changed, i.e. the most recent row
-- whose price differs from the price of the chronologically previous row.
select t.*
from (select t.*,
             -- Price of the immediately OLDER row. The window must run in
             -- ascending date order: with "order by d desc" lag() reads the
             -- newer row, prev_p is null on the newest row, and that row
             -- would always be returned.
             lag(p) over (order by d) as prev_p
      from temp_table t
     ) t
-- prev_p is null only on the oldest row, which counts as the initial price
where prev_p is null or prev_p <> p
-- of all the change points keep the most recent one
order by d desc
fetch first 1 row only;
If you wanted to do this for multiple values of a at the same time, then there are different approaches. An interesting one uses a difference of row numbers:
-- For every product a: the current price p and the date d on which the
-- current run of that price started (the last price change).
-- The date column of temp_table is named d, not date.
select a, p, min(d) as d
from (select t.*,
             -- position among all rows of the product, newest first
             row_number() over (partition by a order by d desc) as seqnum,
             -- position among the rows of the product with the same price, newest first
             row_number() over (partition by a, p order by d desc) as seqnum_2
      from temp_table t
     ) t
-- both numbers agree only while no other price has been seen yet, i.e. on the
-- most recent uninterrupted run of the current price
where seqnum = seqnum_2
group by a, p;
You can investigate why this works. The two row numbers are the same only for the last price for each a.

How to get the most frequent value in Google's Bigquery

Postgres has an easy function to achieve this, just using the mode() function we can find the most frequent value. Is there something equivalent within Google's Bigquery?
How could be written a query like this in Bigquery?
-- Summary statistics of the non-null vehicles values (PostgreSQL):
-- row count, mean, median and most frequent value.
SELECT
    COUNT(*),
    AVG(vehicles) AS mean,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY vehicles) AS median,
    MODE() WITHIN GROUP (ORDER BY vehicles) AS most_frequent_value
FROM "driver"
WHERE vehicles IS NOT NULL;

Below is for BigQuery Standard SQL
Option 1
#standardSQL
-- Option 1: count, mean and (approximate) mode from one aggregate pass; the
-- median comes from an analytic PERCENTILE_CONT reduced to a single row.
WITH aggregates AS (
  SELECT
    COUNT(*) AS cnt,
    AVG(vehicles) AS mean,
    APPROX_TOP_COUNT(vehicles, 1)[OFFSET(0)].value AS most_frequent_value
  FROM `project.dataset.table`
  WHERE vehicles IS NOT NULL
),
median_row AS (
  -- the window value is identical on every row, so any single row will do
  SELECT PERCENTILE_CONT(vehicles, 0.5) OVER() AS median
  FROM `project.dataset.table`
  WHERE vehicles IS NOT NULL
  LIMIT 1
)
SELECT *
FROM aggregates
CROSS JOIN median_row
Option 2
#standardSQL
-- Option 2: like option 1, but the mode is exact: group by value, order by
-- frequency and keep the top row.
WITH aggregates AS (
  SELECT
    COUNT(*) AS cnt,
    AVG(vehicles) AS mean
  FROM `project.dataset.table`
  WHERE vehicles IS NOT NULL
),
median_row AS (
  -- the window value is identical on every row, so any single row will do
  SELECT PERCENTILE_CONT(vehicles, 0.5) OVER() AS median
  FROM `project.dataset.table`
  WHERE vehicles IS NOT NULL
  LIMIT 1
),
mode_row AS (
  SELECT vehicles AS most_frequent_value
  FROM `project.dataset.table`
  WHERE vehicles IS NOT NULL
  GROUP BY vehicles
  ORDER BY COUNT(*) DESC
  LIMIT 1
)
SELECT *
FROM aggregates
CROSS JOIN median_row
CROSS JOIN mode_row
Option 3
#standardSQL
-- Option 3: wrap median and mode in temporary SQL functions that take the
-- aggregated array of values, so the main query stays a single aggregate.

-- Median of the elements of arr.
CREATE TEMP FUNCTION median(arr ANY TYPE) AS ((
  SELECT PERCENTILE_CONT(elem, 0.5) OVER()
  FROM UNNEST(arr) AS elem
  LIMIT 1
));

-- Most frequent element of arr.
CREATE TEMP FUNCTION most_frequent_value(arr ANY TYPE) AS ((
  SELECT elem
  FROM UNNEST(arr) AS elem
  GROUP BY elem
  ORDER BY COUNT(*) DESC
  LIMIT 1
));

SELECT
  COUNT(*) AS cnt,
  AVG(vehicles) AS mean,
  median(ARRAY_AGG(vehicles)) AS median,
  most_frequent_value(ARRAY_AGG(vehicles)) AS most_frequent_value
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
and so on ...

You can use APPROX_TOP_COUNT to get top values, e.g.:
-- The five most frequent vehicles values together with their approximate counts.
SELECT
  APPROX_TOP_COUNT(vehicles, 5) AS top_five_vehicles
FROM dataset.driver
If you just want the top value, you can select it from the array:
-- Most frequent vehicles value. APPROX_TOP_COUNT yields an array of
-- (value, count) structs, so .value is needed to return the value itself
-- rather than the whole struct.
SELECT
  APPROX_TOP_COUNT(vehicles, 1)[OFFSET(0)].value AS most_frequent_value
FROM dataset.driver

No, there is no equivalent of the mode()-function in BigQuery, but you may define one yourself using any of the logics in the other answers to this thread. You could call it like so:
SELECT mode(`an_array`) AS top_count FROM `somewhere_with_arrays`
but this approach leads to a sub-query per row, which is terrible for performance, so if you have never ground BQ to a halt before, you can do it with these functions. I use it (the second one) only for readability in quick fixes on very small data sets.
Check out the two UDFs below. A third approach would be to implement a JS function, in which case this one-liner should be useful:
return arr.sort((a,b) => arr.filter(v => v===a).length - arr.filter(v => v===b).length).pop();
This code establishes two mode()-like functions which eat arrays and return most common string:
-- mode1: returns the most frequent element of the input array.
--   mystring: despite its name, an array (it is passed to UNNEST); the
--             declared STRING return type means its elements must be strings.
CREATE TEMPORARY FUNCTION mode1(mystring ANY TYPE)
RETURNS STRING
AS
(
  (
    SELECT var
    FROM UNNEST(mystring) AS var
    GROUP BY var                    /* Output is one of the existing values */
    /* ORDER BY and LIMIT must sit in the same query level: the row order of
       an inner subquery is not guaranteed to survive into an outer LIMIT.
       The secondary key var makes ties deterministic. */
    ORDER BY COUNT(*) DESC, var     /* Output is the value with the HIGHEST count */
    LIMIT 1                         /* Only ONE string is the output */
  )
);
-- mode2: returns the most frequent element of the input array, as reported
-- by APPROX_TOP_COUNT with a top-list size of 1.
CREATE TEMPORARY FUNCTION mode2(inp ANY TYPE)
RETURNS STRING
AS
(
  (
    SELECT top_entry.value
    FROM UNNEST((
      SELECT APPROX_TOP_COUNT(elem, 1)
      FROM UNNEST(inp) AS elem
    )) AS top_entry
  )
);
-- Run both UDFs side by side on two small test arrays.
WITH test_data AS (
  SELECT ['Erdős','Turán', 'Erdős','Turán','Euler','Erdős'] AS inp
  UNION ALL
  SELECT ['Euler','Euler', 'Gauss', 'Euler']
)
SELECT
  inp,
  mode1(inp) AS first_logic_output,
  mode2(inp) AS second_logic_output
FROM test_data

The method I prefer is to query off of an array, since you can easily adjust the criteria of the mode. Below are two examples, one using a limit and one using an offset. With the offset you can grab the Nth most/least frequent value.
-- Sample prices per (length, amps) group.
WITH t AS (
  SELECT 18 AS length, 'HIGH' AS amps, 99.95 AS price UNION ALL
  SELECT 18, 'HIGH', 99.95 UNION ALL
  SELECT 18, 'HIGH', 5.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 4.5 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 9.99 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 5.65
),
-- All prices of a group collected into one array per (length, amps).
price_arrays AS (
  SELECT
    length,
    amps,
    ARRAY_AGG(price) AS price_array
  FROM t
  GROUP BY length, amps
)
SELECT
  length,
  amps,
  -- By LIMIT: order the distinct prices by frequency and take the first row.
  -- Sorting by x as well keeps the result stable when frequencies tie.
  (SELECT x FROM UNNEST(price_array) AS x
   GROUP BY x ORDER BY COUNT(*) DESC, x LIMIT 1) AS most_freq_price,
  (SELECT x FROM UNNEST(price_array) AS x
   GROUP BY x ORDER BY COUNT(*) ASC, x LIMIT 1) AS least_freq_price,
  -- By OFFSET: index into the ordered array; a different offset returns the
  -- Nth most/least frequent price.
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
        GROUP BY x ORDER BY COUNT(*) DESC, x)[OFFSET(0)] AS most_freq_price_offset,
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
        GROUP BY x ORDER BY COUNT(*) ASC, x)[OFFSET(0)] AS least_freq_price_offset
FROM price_arrays

Bigquery- Struct format

-- Turn a comma-separated string column into two INT64 arrays, built in two
-- equivalent ways: ARRAY_AGG in a scalar subquery, and the ARRAY() constructor.
WITH yourTable AS (
  SELECT 1 AS id, '2013,1625,1297,7634' AS string_col
  UNION ALL
  SELECT 2, '1,2,3,4,5'
)
SELECT
  id,
  (
    SELECT ARRAY_AGG(CAST(part AS INT64))
    FROM UNNEST(SPLIT(string_col, ',')) AS part
  ) AS num,
  ARRAY(
    SELECT CAST(part AS INT64)
    FROM UNNEST(SPLIT(string_col, ',')) AS part
  ) AS num_2
FROM yourTable
This is exactly how my actual table is designed. Now I would like to multiply num * num_2 element by element and then sum the products. Is there a way to get this into a struct format like id, nums.num, nums.num_2 so that I can simply multiply and get the necessary result?
PS: I am looking for a solution in the SELECT statement above, not within the WITH clause.

Ok, assuming that you really have reason to have your table the way you have (see my comment on your question) - below should work
#standardSQL
-- Per row: pair the elements of num and num_2 by array position, multiply
-- each pair and sum the products.
SELECT
  id,
  (
    SELECT SUM(left_val * right_val)
    FROM UNNEST(num) AS left_val WITH OFFSET AS left_pos
    INNER JOIN UNNEST(num_2) AS right_val WITH OFFSET AS right_pos
      ON left_pos = right_pos
  ) AS mul
FROM yourTable
you can test it with below
#standardSQL
-- Self-contained demo: two rows with identical num / num_2 arrays, so mul is
-- the sum of squares of the elements.
WITH yourTable AS (
  SELECT 1 AS id, [2013,1625,1297,7634] AS num, [2013,1625,1297,7634] AS num_2
  UNION ALL
  SELECT 2, [1,2,3,4,5], [1,2,3,4,5]
)
SELECT
  id,
  (
    SELECT SUM(lhs.val * rhs.val)
    FROM (SELECT idx, val FROM UNNEST(num) AS val WITH OFFSET AS idx) AS lhs
    INNER JOIN (SELECT idx, val FROM UNNEST(num_2) AS val WITH OFFSET AS idx) AS rhs
      ON lhs.idx = rhs.idx
  ) AS mul
FROM yourTable

Grouping array elements in the correct order on PostgreSQL

Is it possible to group array elements in PostgreSQL?
For example, I have 2 related arrays like this (I say related because the first array indicates actions and the second array represents those actions' times):
col0 = 'any_value'
col1 = array1['a','b','b','c','c','a','a','a','c']
col2 = array2[1,2,3,4,5,6,7,8,9]
and I would like to output the following result:
col0 = 'any_value'
array_result1['a','b','c','a','c']
array_result2[1,2,4,6,9]
A way the array can be unnested is by using ordinality, this is an example query, but it returns a distinct selection of the array elements which removes the repeated ones:
-- For each col0: the distinct elements of array1, ordered by the position of
-- their first occurrence (repeated elements are dropped).
SELECT col0,
       ARRAY_AGG(x ORDER BY rn) AS unique_array1
FROM (
    -- DISTINCT ON keeps, per (col0, x), the row with the smallest rn
    SELECT DISTINCT ON (t.col0, elem.x)
           t.col0,
           elem.x,
           elem.rn
    FROM table_a AS t
    CROSS JOIN LATERAL unnest(t.array1) WITH ORDINALITY AS elem (x, rn)
    ORDER BY t.col0, elem.x, elem.rn
) AS unnested_ordered
GROUP BY col0;
So the result of this would be:
col0 = 'any_value'
array_result1['a','b','c']
But as you can see it is missing many elements.
EDIT:
To describe more my issue, In the end I would like to know when each of the array_result1 actions are initially done.
So for the example result
array_result1['a','b','c','a','c']
*array_result2[1,2,4,6,9]
*I suppose the position of the array starts at 1 and not 0; I also fixed the last element, it should be 9, not 7
would help me to know, when did the first action 'a' happen and when did the second action 'a' happen so I can calculate the time for action 'a' to return into the path I am building.
So first time action 'a' happened was = 1
Second time it happened was = 6
So action 'a' appears twice in the path(array) and it takes 5 time units to re appear. That is why I need the second array with the times on which the actions happened (the first time each action happened)

You could use LATERAL and calculate group using ROW_NUMBER:
-- Test fixture: one row with an action array (col1) and a parallel array of
-- action times (col2).
DROP TABLE IF EXISTS table_a;

CREATE TABLE table_a (
    col0 VARCHAR(10),
    col1 TEXT[],
    col2 INT[]
);

INSERT INTO table_a (col0, col1, col2)
VALUES (
    'any_value',
    ARRAY['a','b','b','c','c','a','a','a','c'],
    ARRAY[1,2,3,4,5,6,7,8,9]
);
Main query:
-- For every row of table_a, collapse each run of consecutive equal elements
-- of col1 into a single element, keeping the runs in their original order.
SELECT col0,
       col1,
       unique_col1
FROM table_a,
-- Order by the position at which each run starts. Ordering by grp is wrong
-- in general: grp only separates runs of the SAME value and does not follow
-- array position (e.g. ['a','b','a','a','c','a'] would come out as a,b,a,a,c).
LATERAL (SELECT ARRAY_AGG(s.x ORDER BY s.first_rn) AS unique_col1
         FROM (-- one row per run: (x, grp) identifies the run, MIN(rn) is its start
               SELECT sub.x,
                      sub.grp,
                      MIN(sub.rn) AS first_rn
               FROM (SELECT x,
                            rn,
                            -- constant within a run of equal consecutive values
                            rn - ROW_NUMBER() OVER(PARTITION BY x ORDER BY rn) AS grp
                     FROM unnest(col1) WITH ORDINALITY AS a(x,rn)
                    ) AS sub
               GROUP BY sub.x, sub.grp
              ) AS s
        ) AS lat1
Output:
EDIT:
Calculating second array:
-- For every row of table_a:
--   unique_col1 = col1 with each run of consecutive equal elements collapsed;
--   unique_col2 = the col2 values found at the first position of each run.
SELECT col0,
       col1,
       unique_col1,
       col2,
       unique_col2
FROM table_a,
-- Runs are identified by (x, grp) and ordered by their start position
-- MIN(rn); grp alone neither follows array order nor is unique across
-- different values of x.
LATERAL (SELECT ARRAY_AGG(s.x ORDER BY s.first_rn) AS unique_col1
         FROM (SELECT sub.x,
                      sub.grp,
                      MIN(sub.rn) AS first_rn
               FROM (SELECT x,
                            rn,
                            rn - ROW_NUMBER() OVER(PARTITION BY x ORDER BY rn) AS grp
                     FROM unnest(col1) WITH ORDINALITY AS a(x,rn)
                    ) AS sub
               GROUP BY sub.x, sub.grp
              ) AS s
        ) AS lat1,
LATERAL (
         -- keep the col2 elements whose position is the start of a run in col1
         SELECT array_agg(b.x ORDER BY b.rn) AS unique_col2
         FROM unnest(col2) WITH ORDINALITY AS b(x,rn)
         WHERE b.rn IN (
                        SELECT MIN(sub.rn)
                        FROM (SELECT x,
                                     rn,
                                     rn - ROW_NUMBER() OVER(PARTITION BY x ORDER BY rn) AS grp
                              FROM unnest(col1) WITH ORDINALITY AS a(x,rn)
                             ) AS sub
                        GROUP BY sub.x, sub.grp
                       )
        ) AS lat2
Remark:
It generates second array from values, not its position, so when you have:
col2 = array[9,8,7,6,5,4,3,2,1]
you will get:
[9,8,6,4,1]
If you want only positions you could use:
...
-- Replacement for lat2 that returns the start POSITIONS of the runs in col1
-- instead of the col2 values at those positions.
LATERAL (
         SELECT array_agg(s.result ORDER BY s.result) AS unique_col2
         FROM (
               -- one row per run: (x, grp) identifies the run, MIN(rn) is its
               -- first position; grouping by grp alone would merge runs of
               -- different values that happen to share the same grp
               SELECT MIN(sub.rn) AS result
               FROM (SELECT x,
                            rn,
                            rn - ROW_NUMBER() OVER(PARTITION BY x ORDER BY rn) AS grp
                     FROM unnest(col1) WITH ORDINALITY AS a(x,rn)
                    ) AS sub
               GROUP BY sub.x, sub.grp
              ) AS s
        ) AS lat2
And the result will be:
[1,2,4,6,9]
EDIT 2
In above version there is small mistake. The ARRAY_AGG should be ordered by rn not grp:
-- Test fixture with two rows; in the second row a value recurs after a gap,
-- so the order of the runs matters.
DROP TABLE IF EXISTS table_a;

CREATE TABLE table_a (
    col0 VARCHAR(10),
    col1 TEXT[],
    col2 INT[]
);

INSERT INTO table_a (col0, col1, col2)
VALUES
    ('any_value',
     ARRAY['a','b','b','c','c','a','a','a','c'],
     ARRAY[1,2,3,4,5,6,7,8,9]),
    ('any_value2',
     ARRAY['a','b','a','a','c','a'],
     ARRAY[1,2,3,4,5,6]);
-- Collapse each run of consecutive equal elements of col1 into one element,
-- keeping the runs in the order in which they start.
SELECT *
FROM table_a
CROSS JOIN LATERAL (
    SELECT ARRAY_AGG(runs.x ORDER BY runs.rn) AS unique_col1
    FROM (
        -- one row per run; rn is the position at which the run starts
        SELECT numbered.x,
               numbered.grp,
               MIN(numbered.rn) AS rn
        FROM (
            SELECT elem.x,
                   -- constant within a run of equal consecutive values
                   elem.rn - ROW_NUMBER() OVER (PARTITION BY elem.x ORDER BY elem.rn) AS grp,
                   elem.rn
            FROM unnest(col1) WITH ORDINALITY AS elem (x, rn)
        ) AS numbered
        GROUP BY numbered.x, numbered.grp
    ) AS runs
) AS lat1;

Find Top Most AND Lowest In a Table's Group Column

I have a table with 4 fields: ID, Price, QTY and Ratting, plus an optional [Position] column.
I have all the records grouped by the columns [Qty, Ratting].
I have to determine each row's position within its group and store it in the optional Position column.
For better understanding I have added an image with data in table:
On the basis of QTY in Each Rating I have to Mark Top3, Bottom3 and Rest of them as remaining.
I am not getting how to do it.
Can anybody suggest me how to do it?
So far what I've tried is:
-- Test fixture. "DECLARE #name TABLE" / "DECLARE #name VARCHAR" are not valid
-- T-SQL: a # name denotes a temporary table, which is created with
-- CREATE TABLE, and variables are prefixed with @.
CREATE TABLE #RankTable
(
    ID INT,
    Price DECIMAL(10,2),
    Qty INT,
    Ratting INT
);

INSERT INTO #RankTable (ID, Price, Qty, Ratting)
VALUES
    (1, 10, 15, 1),
    (2, 11, 11, 1),
    (3, 96, 10, 1),
    (4, 96, 8, 1),
    (5, 56, 7, 1),
    (6, 74, 5, 1),
    (7, 93, 4, 1),
    (8, 98, 2, 1),
    (9, 12, 1, 1),
    (10, 32, 80, 2),
    (11, 74, 68, 2),
    (12, 58, 57, 2),
    (13, 37, 43, 2),
    (14, 79, 32, 2),
    (15, 29, 28, 2),
    (16, 46, 17, 2),
    (17, 86, 13, 2),
    (19, 75, 110, 3),
    (20, 27, 108, 3),
    (21, 38, 104, 3),
    (22, 87, 100, 3),
    (23, 47, 89, 3);

-- Placeholder for the position label that still has to be computed; it is
-- never assigned, so Position comes out as '' on every row.
DECLARE @PositionGroup VARCHAR(1);

SELECT *, ISNULL(@PositionGroup, '') AS Position
FROM #RankTable;

You can try this:
-- Label every row by its Qty rank within its Ratting group:
--   0 = one of the 3 highest Qty, 1 = one of the 3 lowest Qty, 2 = the rest.
SELECT ID
      ,Price
      ,Qty
      ,Ratting
      ,CASE WHEN RowID <= 3 THEN 0           -- checked first, so the top 3 win in groups with fewer than 6 rows
            WHEN RowID > Total - 3 THEN 1
            ELSE 2
       END AS Position
FROM (SELECT ID
            ,Price
            ,Qty
            ,Ratting
            ,COUNT(*) OVER(PARTITION BY Ratting) AS Total                              -- rows in the group
            ,ROW_NUMBER() OVER(PARTITION BY Ratting ORDER BY Qty DESC) AS RowID        -- 1 = highest Qty
            -- the ISNULL(#PositionGroup,'') placeholder column was dropped: it
            -- was never read and #PositionGroup is not a valid variable name
      FROM #RankTable) AS T

Use Window Function. Try this.
-- Label every row by its Qty rank within its Ratting group:
--   0 = one of the 3 highest Qty, 1 = one of the 3 lowest Qty, 2 = the rest.
;WITH cte
     AS (SELECT *,
                -- The column is spelled Ratting in #RankTable, and the ranking
                -- has to follow Qty (highest first), not id.
                Row_number()OVER(partition BY Ratting ORDER BY Qty DESC) rn,
                count(id)OVER(partition BY Ratting) mx
         FROM #RankTable)
SELECT ID,
       Price,
       Qty,
       Ratting,
       mx - rn,  -- number of rows ranked below this one in its group
       CASE WHEN rn IN ( 1, 2, 3 ) THEN 0
            WHEN mx - rn IN( 0, 1, 2 ) THEN 1
            ELSE 2
       END position
FROM cte

try this as well.
-- Label every row by its Qty rank within its Ratting group
-- (0 = first three ranks, 1 = last three ranks, 2 = the rest) by joining each
-- row to the first and last rank number of its group.
;WITH ranked AS
(
    SELECT ID,
           Price,
           Qty,
           Ratting,
           ROW_NUMBER() OVER (PARTITION BY Ratting ORDER BY Qty DESC) AS rn
    FROM #RankTable
),
bounds AS
(
    SELECT Ratting,
           MIN(rn) AS first_rn,
           MAX(rn) AS last_rn
    FROM ranked
    GROUP BY Ratting
)
SELECT r.ID,
       r.Price,
       r.Qty,
       b.Ratting,
       CASE
           WHEN r.rn - b.first_rn <= 2 THEN 0
           WHEN b.last_rn - r.rn <= 2 THEN 1
           ELSE 2
       END AS Position
FROM bounds AS b
INNER JOIN ranked AS r
    ON r.Ratting = b.Ratting
Result:

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Find the mode in BigQuery - google-bigquery

select id, value as value_list, v as value_mode from ( select id, value, v, count(1) as c, row_number() over(partition by id order by c desc) as top from ( select id, value, split(value) as v from dataset.table ) group by id, value, v ) where top = 1

There is a direct function available now approx_top_count() Here is an example of its usage https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_top_count

Related

Get oldest modified price

How to get the most frequent value in Google's Bigquery

Bigquery- Struct format

Grouping array elements in the correct order on PostgreSQL

Find Top Most AND Lowest In a Table's Group Column

Categories

Resources