How to create multiple arrays with random numbers in BigQuery

How to create multiple arrays with random numbers in BigQuery - google-bigquery

I have the following piece of code, in which I create 3 random arrays of length 5, 10, and 15 respectively.
with example as (
select 1 as id, mod(cast(10*rand() as int64), 10) as random_array from unnest(generate_array(1, 5))
union all select 2 as id, mod(cast(10*rand() as int64), 10) from unnest(generate_array(1, 10))
union all select 3 as id, mod(cast(10*rand() as int64), 10) from unnest(generate_array(1, 15))
)
select
id,
array_agg(random_array) as nested_numbers
from unnesting_example
group by id
Is there a (smarter) way to do this by creating the arrays within the with clause? So, without using an extra select? Thanks.

You can use STRUCT
SELECT * FROM UNNEST([
STRUCT(1 AS id, ARRAY((SELECT mod(cast(10*rand() as int64), 10) FROM UNNEST(GENERATE_ARRAY(1, 5)))) AS random_arr),
STRUCT(2 AS id, ARRAY((SELECT mod(cast(10*rand() as int64), 10) FROM UNNEST(GENERATE_ARRAY(1, 10)))) AS random_arr),
STRUCT(3 AS id, ARRAY((SELECT mod(cast(10*rand() as int64), 10) FROM UNNEST(GENERATE_ARRAY(1, 15)))) AS random_arr)
])
Output
id
random_arr
1
[2,2,7,3,5]
2
[8,5,8,2,1,1,7,3,9,8]
3
[9,1,7,0,2,1,6,6,7,8,8,0,2,4,1]

Consider below
select id,
array(
select mod(cast(10 * rand() as int64), 10)
from unnest(generate_array(1, len) )
) as random_arr
from unnest([struct<id int64, len int64>(1,5), (2,10), (3,15)])
with output

Related

If Array_Agg Struct have duplicate data how we can delete it in Big Query

If Array_Agg Struct have duplicate data how we can delete it in Big Query Eg. (1 ,[1,2,3] )`` (2, [3,4,5]) (1, [1,2,3])
Required Output` (1 ,[1,2,3] ), (2, [3,4,5]),

To remove duplicates in your array of structs, you can unnest your arrays to flatten the data and then select distinct values. When duplicates are removed, recreate the arrays and structs. See approach below:
with sample_data as (
select struct(1 as col1,[1,2,3] as col2) as test
union all select struct(2 as col1 ,[3,4,5] as col2) as test
union all select struct(1 as col1,[1,2,3] as col2) as test
),
t1 as (
select array_agg(test) as arr_of_structs from sample_data
),
-- disregard query above since it is for generating the sample data
remove_dups as (
select
distinct d_1.col1, d_2
from t1,
unnest(arr_of_structs) as d_1,
unnest(d_1.col2) as d_2
),
rebuild_struct as (
select
struct(col1,array_agg(d_2) as col2) as test
from remove_dups
group by col1
)
select array_agg(test) as arr_of_structs from rebuild_struct
Output:

one more option:
with your_table as (
select ([struct(1 as id ,[1,2,3] as arr), (2, [3,4,5]), (1, [1,2,3])]) array_agg_col
)
select array(
select any_value(x)
from t.array_agg_col x
group by format('%t', x)
) new_col
from your_table t
with output

You may consider this:
WITH sample_table AS (
SELECT [STRUCT(1 AS int_field, [1,2,3] AS arr_field), (2, [3,4,5]), (1, [1,2,3])] array_agg_col
)
SELECT ARRAY(
SELECT AS STRUCT *
FROM t.array_agg_col t1
QUALIFY ROW_NUMBER() OVER (PARTITION BY TO_JSON_STRING(t1)) = 1
) deduped_col
FROM sample_table t;

Distinct key with ARRAY_CONCAT with Struct<String, String>

I need to do the following with 2 array fields in the table below. The arrays are of type Struct<String, String>.
Merge the arrays together
If there is a duplicate key between the labels.key and project.key, then I only want to keep the kvp from the labels field
flatten the combined array into a delimited string an order them (so I can group by)
Example Table Data
SELECT 1 as id, ARRAY
[STRUCT("testlabel2" as key, "thisvalueisbetter" as value), STRUCT("testlabel3", "testvalue3")] as labels,
[STRUCT("testlabel2" as key, "testvalue2" as value)] as project
The below query does everything except #2 and I'm not sure how to accomplish that. Does anyone have a suggestion on how to do this?
SELECT
id,
(SELECT STRING_AGG(DISTINCT CONCAT(l.key, ':', l.value) ORDER BY CONCAT(l.key, ':', l.value))
FROM UNNEST(
ARRAY_CONCAT(labels, project)) AS l) AS label,
FROM `mytestdata` AS t
GROUP BY id, label
Currently this query gives the output:
1 testlabel2:testvalue2,testlabel2:thisvalueisbetter,testlabel3:testvalue3
But I'm looking for:
1 testlabel2:thisvalueisbetter,testlabel3:testvalue3

Below is for BigQuery Standard SQL
#standardSQL
SELECT *,
ARRAY(
SELECT AS STRUCT key, ARRAY_AGG(value ORDER BY source LIMIT 1)[OFFSET(0)] AS value
FROM (
SELECT 0 AS source, * FROM t.labels UNION ALL
SELECT 1, * FROM t.project
)
GROUP BY key
) AS combined_array
FROM `project.dataset.table` t
You can test, play with above using sample data from your question as in below example
#standardSQL
WITH `project.dataset.table` AS (
SELECT ARRAY
[STRUCT("testlabel2" AS key, "thisvalueisbetter" AS value), STRUCT("testlabel3", "testvalue3")] AS labels,
[STRUCT("testlabel2" AS key, "testvalue2" AS value)] AS project
)
SELECT *,
ARRAY(
SELECT AS STRUCT key, ARRAY_AGG(value ORDER BY source LIMIT 1)[OFFSET(0)] AS value
FROM (
SELECT 0 AS source, * FROM t.labels UNION ALL
SELECT 1, * FROM t.project
)
GROUP BY key
) AS combined_array
FROM `project.dataset.table` t
with result
Or ... to fully match your expected output - use below
#standardSQL
SELECT *,
(SELECT STRING_AGG(x) FROM (
SELECT CONCAT(key, ':', ARRAY_AGG(value ORDER BY source LIMIT 1)[OFFSET(0)]) x
FROM (
SELECT 0 AS source, * FROM t.labels UNION ALL
SELECT 1, * FROM t.project
)
GROUP BY key
)) AS combined_result
FROM `project.dataset.table` t
with result

How to get the most frequent value in Google's Bigquery

Postgres has an easy function to achieve this, just using the mode() function we can find the most frequent value. Is there something equivalent within Google's Bigquery?
How could be written a query like this in Bigquery?
select count(*),
avg(vehicles) as mean,
percentile_cont(0.5) within group (order by vehicles) as median,
mode() within group (order by vehicles) as most_frequent_value
FROM "driver"
WHERE vehicles is not null;

Below is for BigQuery Standard SQL
Option 1
#standardSQL
SELECT * FROM (
SELECT COUNT(*) AS cnt,
AVG(vehicles) AS mean,
APPROX_TOP_COUNT(vehicles, 1)[OFFSET(0)].value AS most_frequent_value
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
) CROSS JOIN (
SELECT PERCENTILE_CONT(vehicles, 0.5) OVER() AS median
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
LIMIT 1
)
Option 2
#standardSQL
SELECT * FROM (
SELECT COUNT(*) cnt,
AVG(vehicles) AS mean
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
) CROSS JOIN (
SELECT PERCENTILE_CONT(vehicles, 0.5) OVER() AS median
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
LIMIT 1
) CROSS JOIN (
SELECT vehicles AS most_frequent_value
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
GROUP BY vehicles
ORDER BY COUNT(1) DESC
LIMIT 1
)
Option 3
#standardSQL
CREATE TEMP FUNCTION median(arr ANY TYPE) AS ((
SELECT PERCENTILE_CONT(x, 0.5) OVER()
FROM UNNEST(arr) x LIMIT 1
));
CREATE TEMP FUNCTION most_frequent_value(arr ANY TYPE) AS ((
SELECT x
FROM UNNEST(arr) x
GROUP BY x
ORDER BY COUNT(1) DESC
LIMIT 1
));
SELECT COUNT(*) cnt,
AVG(vehicles) AS mean,
median(ARRAY_AGG(vehicles)) AS median,
most_frequent_value(ARRAY_AGG(vehicles)) AS most_frequent_value
FROM `project.dataset.table`
WHERE vehicles IS NOT NULL
and so on ...

You can use APPROX_TOP_COUNT to get top values, e.g.:
SELECT APPROX_TOP_COUNT(vehicles, 5) AS top_five_vehicles
FROM dataset.driver
If you just want the top value, you can select it from the array:
SELECT APPROX_TOP_COUNT(vehicles, 1)[OFFSET(0)] AS most_frequent_value
FROM dataset.driver

No, there is no equivalent of the mode()-function in BigQuery, but you may define one yourself using any of the logics in the other answers to this thread. You could call it like so:
SELECT mode(`an_array`) AS top_count FROM `somewhere_with_arrays`
but this approach lead to multiple by-row sub-queries wihch is terrible for performance, so if you never grinded BQ to a halt before, you can do it with these functions. I it (the second) only for readability in quick-fixes for very small data-sets.
Check out the two UDF:s below. A third approach would be to implement a JS function in which case this oneliner should be usefull
return arr.sort((a,b) => arr.filter(v => v===a).length - arr.filter(v => v===b).length).pop();
This code establishes two mode()-like functions which eat arrays and return most common string:
CREATE TEMPORARY FUNCTION mode1(mystring ANY TYPE)
RETURNS STRING
AS
(
(
SELECT var FROM
( /* Count occurances of each value of input */
SELECT var, COUNT(*) AS n FROM
( /* Unnest and name*/
SELECT var FROM UNNEST(mystring) var
)
GROUP BY var /* Output is one of existing values */
ORDER BY n DESC /* Output is value with HIGHEST n */
) /* -------------------------------- */
LIMIT 1 /* Only ONE string is the output */
)
);
CREATE TEMPORARY FUNCTION mode2(inp ANY TYPE)
RETURNS STRING
AS
(
(
SELECT result.value FROM UNNEST( (SELECT APPROX_TOP_COUNT(v,1) AS result FROM UNNEST(inp) v)) result
)
);
SELECT
inp,
mode1(inp) AS first_logic_output,
mode2(inp) AS second_logic_output
FROM
(
/* Test data */
SELECT ['Erdős','Turán', 'Erdős','Turán','Euler','Erdős'] AS inp
UNION ALL
SELECT ['Euler','Euler', 'Gauss', 'Euler'] AS inp
)

The method I prefer is to query off of an array since you can easily adjust the criteria of the mode. Below are two example using both an offset and a limit method. With the offset you can grab the Nth most/least frequent value.
WITH t AS (SELECT 18 AS length,
'HIGH' as amps,
99.95 price UNION ALL
SELECT 18, "HIGH", 99.95 UNION ALL
SELECT 18, "HIGH", 5.95 UNION ALL
SELECT 18, "LOW", 33.95 UNION ALL
SELECT 18, "LOW", 33.95 UNION ALL
SELECT 18, "LOW", 4.5 UNION ALL
SELECT 3, "HIGH", 77.95 UNION ALL
SELECT 3, "HIGH", 77.95 UNION ALL
SELECT 3, "HIGH", 9.99 UNION ALL
SELECT 3, "LOW", 44.95 UNION ALL
SELECT 3, "LOW", 44.95 UNION ALL
SELECT 3, "LOW", 5.65
)
SELECT
length,
amps,
-- By Limit
(SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) DESC LIMIT 1 ) most_freq_price,
(SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) ASC LIMIT 1 ) least_freq_price,
-- By Offset
ARRAY((SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) DESC))[OFFSET(0)] most_freq_price_offset,
ARRAY((SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) ASC))[OFFSET(0)] least_freq_price_offset
FROM (
SELECT
length,
amps,
ARRAY_AGG(price) price_array
FROM t
GROUP BY 1,2
)

Bigquery- Struct format

WITH yourTable AS (
SELECT 1 AS id, '2013,1625,1297,7634' AS string_col UNION ALL
SELECT 2, '1,2,3,4,5'
)
SELECT id,
(SELECT ARRAY_AGG(CAST(num AS INT64))
FROM UNNEST(SPLIT(string_col)) AS num
) AS num,
ARRAY(SELECT CAST(num AS INT64)
FROM UNNEST(SPLIT(string_col)) AS num
) AS num_2
FROM yourTable
This is how exactly my actual table is designed and Now I would like to multiply num*num_2 and then later sum it up. Is there a way to get this into struct format like ID, nums.num,nums.num_2 so that I can simply multiply which gives me the necessary result.
PS: I am looking for solution in the select statement above but not within "with" statement.

Ok, assuming that you really have reason to have your table the way you have (see my comment on your question) - below should work
#standardSQL
SELECT id,
(
SELECT SUM(num * num_2)
FROM (SELECT pos, num FROM UNNEST(num) num WITH OFFSET pos) a
JOIN (SELECT pos_2, num_2 FROM UNNEST(num_2) num_2 WITH OFFSET pos_2) b
ON a.pos = b.pos_2
) mul
FROM yourTable
you can test it with below
#standardSQL
WITH yourTable AS (
SELECT 1 id, [2013,1625,1297,7634] num, [2013,1625,1297,7634] num_2 UNION ALL
SELECT 2, [1,2,3,4,5], [1,2,3,4,5]
)
SELECT id,
(
SELECT SUM(num * num_2)
FROM (SELECT pos, num FROM UNNEST(num) num WITH OFFSET pos) a
JOIN (SELECT pos_2, num_2 FROM UNNEST(num_2) num_2 WITH OFFSET pos_2) b
ON a.pos = b.pos_2
) mul
FROM yourTable

Find the mode in BigQuery

The mode is the value that appears most often in a set.
I would like something like:
SELECT
t.id as t_id,
GROUP_CONCAT(t.value) as value_list,
MODE(t.value) AS value_mode
FROM dataset.table as t
GROUP BY t_id
such that, for example:
t_id value_list value_mode
1 2,2,2,3,6,6 2
How is that done?
EDIT: The value_list is just there for illustration purpose. Only need the mode

select id, value as value_list, v as value_mode
from (
select
id, value, v,
count(1) as c,
row_number() over(partition by id order by c desc) as top
from (
select id, value, split(value) as v
from dataset.table
)
group by id, value, v
)
where top = 1

I often have to find the mode of prices for respective groups (e.g. length and amps) to filter out sale prices and the like.
I typically use two methods both with creating an array and un-nesting it in order of frequency. One method I use is by a LIMIT another with an [OFFSET(0)] in case you want to get Nth values.
Both are included below:
WITH t AS (SELECT 18 AS length,
'HIGH' as amps,
99.95 price UNION ALL
SELECT 18, "HIGH", 99.95 UNION ALL
SELECT 18, "HIGH", 5.95 UNION ALL
SELECT 18, "LOW", 33.95 UNION ALL
SELECT 18, "LOW", 33.95 UNION ALL
SELECT 18, "LOW", 4.5 UNION ALL
SELECT 3, "HIGH", 77.95 UNION ALL
SELECT 3, "HIGH", 77.95 UNION ALL
SELECT 3, "HIGH", 9.99 UNION ALL
SELECT 3, "LOW", 44.95 UNION ALL
SELECT 3, "LOW", 44.95 UNION ALL
SELECT 3, "LOW", 5.65
)
SELECT
length,
amps,
-- By Limit
(SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) DESC LIMIT 1 ) most_freq_price,
(SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) ASC LIMIT 1 ) least_freq_price,
-- By Offset
ARRAY((SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) DESC))[OFFSET(0)] most_freq_price_offset,
ARRAY((SELECT x FROM UNNEST(price_array) x
GROUP BY x ORDER BY COUNT(*) ASC))[OFFSET(0)] least_freq_price_offset
FROM (
SELECT
length,
amps,
ARRAY_AGG(price) price_array
FROM t
GROUP BY 1,2
)

For your example, this is how I would solve it:
SELECT x, w mode
FROM (
SELECT COUNT(*) c, w, ROW_NUMBER() OVER(ORDER BY c DESC) rn, FIRST(x) x
FROM (
SELECT SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)
)
GROUP BY 2
)
WHERE rn=1
And with the GROUP_CONCAT within query:
SELECT gc, w mode
FROM (
SELECT COUNT(*) c, w, ROW_NUMBER() OVER(ORDER BY c DESC) rn, FIRST(gc) gc
FROM (
SELECT GROUP_CONCAT(w) OVER() gc, w
FROM (FLATTEN((
SELECT SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)), w)
)
)
GROUP BY 2
)
WHERE rn=1
And handling partitions:
SELECT tid, gc value_list, w value_mode
FROM (
SELECT tid, COUNT(*) c, w, ROW_NUMBER() OVER(PARTITION BY tid ORDER BY c DESC) rn, FIRST(gc) gc
FROM (
SELECT tid, GROUP_CONCAT(w) OVER(PARTITION BY tid) gc, w
FROM (FLATTEN((
SELECT 1 tid, SPLIT(x) w, x FROM (SELECT "2,2,2,3,6,6" x)), w)
)
)
GROUP BY tid, w
)
WHERE rn=1

There is a direct function available now
approx_top_count()
Here is an example of its usage
https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_top_count

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to create multiple arrays with random numbers in BigQuery - google-bigquery

Consider below select id, array( select mod(cast(10 * rand() as int64), 10) from unnest(generate_array(1, len) ) ) as random_arr from unnest([struct<id int64, len int64>(1,5), (2,10), (3,15)]) with output

Related

If Array_Agg Struct have duplicate data how we can delete it in Big Query

Distinct key with ARRAY_CONCAT with Struct<String, String>

How to get the most frequent value in Google's Bigquery

Bigquery- Struct format

Find the mode in BigQuery

Categories

Resources