Element-wise function on BigQuery array - google-bigquery

How can I apply a function element-wise to a BigQuery array? For example, how can I add a constant number to each element in the splits column, or convert the floats to strings?
-- Sample data: a single "800M" race whose participants are an array of
-- STRUCT(name, splits). The last two participants carry an empty array and
-- a NULL in place of their splits.
WITH races_with_struct AS (
    SELECT
        "800M" AS race,
        [
            STRUCT("Rudisha" AS name, [23.4, 26.3, 26.4, 26.1] AS splits),
            STRUCT("Makhloufi" AS name, [24.5, 25.4, 26.6, 26.1] AS splits),
            STRUCT("Lewandowski" AS name, [25.0, 25.7, 26.3, 27.2] AS splits),
            STRUCT("Nathan" AS name, ARRAY<FLOAT64>[] AS splits),
            STRUCT("David" AS name, NULL AS splits)
        ] AS participants
),

-- One row per participant, with the splits array kept intact.
races AS (
    SELECT
        src.race,
        p.name AS name,
        p.splits AS splits
    FROM races_with_struct AS src
    CROSS JOIN UNNEST(src.participants) AS p
)

SELECT
    race,
    name,
    splits
FROM races

Okay, it was easier than I thought it would be. One can just use ARRAY in conjunction with UNNEST in the SELECT statement. Basically it looks like this
-- Template: `transform` and `array` are placeholders for the per-element
-- expression and the array being transformed.
ARRAY(
    SELECT transform(x)
    FROM UNNEST(array) AS x WITH OFFSET AS y
    ORDER BY y  -- y is the element offset
) AS transformed
So to complete the example from above
-- Sample data: a single "800M" race whose participants are an array of
-- STRUCT(name, splits).
WITH races_with_struct AS (
    SELECT
        "800M" AS race,
        [
            STRUCT("Rudisha" AS name, [23.4, 26.3, 26.4, 26.1] AS splits),
            STRUCT("Makhloufi" AS name, [24.5, 25.4, 26.6, 26.1] AS splits),
            STRUCT("Lewandowski" AS name, [25.0, 25.7, 26.3, 27.2] AS splits),
            STRUCT("Nathan" AS name, ARRAY<FLOAT64>[] AS splits),
            STRUCT("David" AS name, NULL AS splits)
        ] AS participants
),

-- One row per participant, with the splits array kept intact.
races AS (
    SELECT
        src.race,
        p.name AS name,
        p.splits AS splits
    FROM races_with_struct AS src
    CROSS JOIN UNNEST(src.participants) AS p
)

SELECT
    race,
    name,
    splits,
    -- Add 2 to every split, ordering the rebuilt array by element offset.
    ARRAY(
        SELECT x + 2
        FROM UNNEST(splits) AS x WITH OFFSET AS y
        ORDER BY y
    ) AS transformed
FROM races

Consider below approach
-- Element-wise transforms of races.splits.
-- Each rebuilt array is ordered by the element offset: without an ORDER BY
-- the element order of ARRAY(subquery) is not guaranteed.
SELECT
    t.*,
    -- Every split rendered as a string.
    ARRAY(
        SELECT CAST(split AS STRING)
        FROM UNNEST(t.splits) AS split WITH OFFSET AS pos
        ORDER BY pos
    ) AS converted_splits,
    -- Every split shifted by a constant.
    ARRAY(
        SELECT 5 + split
        FROM UNNEST(t.splits) AS split WITH OFFSET AS pos
        ORDER BY pos
    ) AS adjusted_splits
FROM races AS t
with output

Related

one hot encode list in bigquery

I would like to use BigQuery instead of Pandas to create dummy variables (one-hot-encoding using multilabelbinarizer) for my categories. I have large number of columns, therefore I can't do it manually and hard code it
Test dataset (the actual one has many more variables than this one)
-- Sample input: every ID carries an array of colors and an array of sizes.
WITH table AS (
    SELECT 1001 AS ID, ['blue', 'green'] AS Color, ['big'] AS size
    UNION ALL
    SELECT 1002 AS ID, ['green', 'yellow'] AS Color, ['medium', 'large'] AS size
    UNION ALL
    SELECT 1003 AS ID, ['red'] AS Color, ['big'] AS size
    UNION ALL
    SELECT 1004 AS ID, ['blue'] AS Color, ['big'] AS size
)
SELECT
    ID,
    Color,
    size
FROM table
EXPECTED output
I wish to store it as a table/dataframe as shown in the image. I have more columns like color,size, products, etc.
Related answer(not a list): one-hot-encoding (dummy variables) with BigQuery
Below query will return your expected output.
-- Sample input: every ID carries an array of colors and an array of sizes.
WITH table AS (
    SELECT 1001 AS ID, ['blue', 'green'] AS Color, ['big'] AS size
    UNION ALL
    SELECT 1002 AS ID, ['green', 'yellow'] AS Color, ['medium', 'large'] AS size
    UNION ALL
    SELECT 1003 AS ID, ['red'] AS Color, ['big'] AS size
    UNION ALL
    SELECT 1004 AS ID, ['blue'] AS Color, ['big'] AS size
),

-- One row per (ID, source column name, array element).
flattened AS (
    SELECT
        ID,
        type,
        value
    FROM table UNPIVOT (values FOR type IN (Color, size))
    CROSS JOIN UNNEST(values) AS value
)

-- One counter column per hard-coded "<column>_<element>" label.
SELECT *
FROM flattened
PIVOT (
    COUNT(1) FOR type || '_' || value IN (
        'Color_blue', 'Color_green', 'Color_yellow', 'Color_red',
        'size_big', 'size_medium', 'size_large'
    )
);
Query results
Based on @Mikhail's answer, you can partially generalize the query by using dynamic SQL. (Column names are still hard-coded.)
-- Builds the PIVOT column list from the data instead of hard-coding every
-- value. The source column names (Color, size) are still hard-coded.
DECLARE Colors, Sizes ARRAY<STRING>;

-- Sample input. Both queries below read from this temp table so that the
-- script runs as-is; swap in your own table name to use real data.
CREATE TEMP TABLE sample_table AS (
    SELECT 1001 AS ID, ['blue', 'green'] AS Color, ['big'] AS size UNION ALL
    SELECT 1002 AS ID, ['green', 'yellow'] AS Color, ['medium', 'large'] AS size UNION ALL
    SELECT 1003 AS ID, ['red'] AS Color, ['big'] AS size UNION ALL
    SELECT 1004 AS ID, ['blue'] AS Color, ['big'] AS size
);

-- Collect the distinct values found in each source column.
SET (Colors, Sizes) = (
    SELECT AS STRUCT
        ARRAY_AGG(DISTINCT IF(type = 'Color', value, NULL) IGNORE NULLS),
        ARRAY_AGG(DISTINCT IF(type = 'size', value, NULL) IGNORE NULLS)
    FROM sample_table UNPIVOT (values FOR type IN (Color, size)), UNNEST(values) value
);

-- The two %s placeholders receive the quoted 'Color_<value>' and
-- 'size_<value>' lists built from the variables above.
EXECUTE IMMEDIATE FORMAT("""
CREATE OR REPLACE TABLE `your-project.your-dataset.output_table` AS
SELECT * FROM (
SELECT ID, type, value FROM sample_table UNPIVOT (values FOR type IN (Color, size)), UNNEST(values) value
) PIVOT (COUNT(1) FOR type || '_' || value IN (%s,%s)) ORDER BY ID;
""", (SELECT STRING_AGG(FORMAT("'Color_%s'", color)) FROM UNNEST(Colors) color),
(SELECT STRING_AGG(FORMAT("'size_%s'", size)) FROM UNNEST(Sizes) size)
);
Consider below approach - most generic I can think of and does not require any knowledge about columns number and names
-- Returns the top-level keys of the JSON object encoded in `input`.
create temp function extract_keys(input string) returns array<string> language js as """
return Object.keys(JSON.parse(input));""";
-- Returns the top-level values of the JSON object encoded in `input`;
-- they are paired with the keys by offset further down.
create temp function extract_values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input));""";
-- flatten_list: one row per (id, "<key>_<val>"). Each row of your_table is
-- serialized to JSON without its id column, keys and values are joined on
-- their offset, and every value is split into individual elements.
create temp table flatten_list as (
select id, format('%s_%s', key, val) col from your_table t,
unnest([to_json_string((select as struct * except(id) from unnest([t])))]) json,
unnest(extract_keys(json)) key with offset
join unnest(extract_values(json)) vals with offset
using (offset), unnest(split(vals)) val
);
-- Pivot on the distinct col values; the quoted list is generated from
-- flatten_list itself, so no column names are hard-coded.
execute immediate format(
'create temp table pivot_table as select * from flatten_list pivot (count(*) for col in (%s)) order by id',
(select string_agg("'" || col || "'", "," order by col)
from (select distinct col from flatten_list))
);
select * from pivot_table;
if applied to sample data in your question - output is

BigQuery SQL - adding new record to existing record

I have two tables I want to create a UNION of, but they do not have the same schema.
Table 1 looks like this:
Table 1
Table 2 looks like this:
Table 2
How can I select all data for Table 2 and add a record for details.employment.locations city,regionCode, and Formatted. I want to just include null values for all of them using BigQuery SQL.
Thank you,
This should work. I have created CTEs for table1 and table2 as per your schema. Since you already have the tables in place, you can start your query from the UNIONED CTE.
-- TABLE1 / TABLE2 stand in for the two real tables. TABLE2 has no
-- DETAILS.EMPLOYMENT.LOCATION record, so the union pads it with NULLs.
WITH TABLE1 AS (
    SELECT
        'WEBSITE 1' AS WEBSITE,
        STRUCT(
            STRUCT('NAME 1 ') AS NAME,
            STRUCT('AGE 1') AS AGE,
            'GENDER' AS GENDER,
            STRUCT(
                'EMPLOYEMENT NAME 1' AS NAME,
                FALSE AS _CURRENT,
                'EMPLOYEMENT TITLE 1' AS TITLE,
                STRUCT(
                    'EMPLOYEMENT LOCATION 1' AS CITY,
                    'EMPLOYEMENT REGION_CODE 1' AS REGION_CODE,
                    'EMPLOYEMENT FORMATTED 1' AS FORMATTED
                ) AS LOCATION
            ) AS EMPLOYMENT
        ) AS DETAILS
),

TABLE2 AS (
    SELECT
        'WEBSITE 1' AS WEBSITE,
        STRUCT(
            STRUCT('NAME 2') AS NAME,
            STRUCT('AGE 2') AS AGE,
            'GENDER' AS GENDER,
            STRUCT(
                'EMPLOYEMENT NAME 2' AS NAME,
                FALSE AS _CURRENT,
                'EMPLOYEMENT TITLE 2' AS TITLE
            ) AS EMPLOYMENT
        ) AS DETAILS
),

-- Flatten both tables to the same column list so they can be unioned.
UNIONED AS (
    SELECT
        WEBSITE,
        DETAILS.NAME AS NAME,
        DETAILS.AGE AS AGE,
        DETAILS.GENDER AS GENDER,
        DETAILS.EMPLOYMENT.NAME AS EMPLOYMENT_NAME,
        DETAILS.EMPLOYMENT._CURRENT AS EMPLOYMENT_CURRENT,
        DETAILS.EMPLOYMENT.TITLE AS EMPLOYMENT_TITLE,
        DETAILS.EMPLOYMENT.LOCATION.CITY AS EMPLOYMENT_LOCATION_CITY,
        DETAILS.EMPLOYMENT.LOCATION.REGION_CODE AS EMPLOYMENT_LOCATION_REGION_CODE,
        DETAILS.EMPLOYMENT.LOCATION.FORMATTED AS EMPLOYMENT_LOCATION_FORMATTED
    FROM TABLE1
    UNION ALL
    SELECT
        WEBSITE,
        DETAILS.NAME,
        DETAILS.AGE,
        DETAILS.GENDER,
        DETAILS.EMPLOYMENT.NAME,
        DETAILS.EMPLOYMENT._CURRENT,
        DETAILS.EMPLOYMENT.TITLE,
        NULL,  -- no location city in TABLE2
        NULL,  -- no location region code in TABLE2
        NULL   -- no formatted location in TABLE2
    FROM TABLE2
)

-- Rebuild the nested record with the same shape as TABLE1.DETAILS.
-- U.NAME and U.AGE are already structs, so they are passed through as-is
-- rather than wrapped in another STRUCT.
SELECT
    U.WEBSITE,
    STRUCT(
        U.NAME AS NAME,
        U.AGE AS AGE,
        U.GENDER AS GENDER,
        STRUCT(
            U.EMPLOYMENT_NAME AS NAME,
            U.EMPLOYMENT_CURRENT AS _CURRENT,
            U.EMPLOYMENT_TITLE AS TITLE,
            STRUCT(
                U.EMPLOYMENT_LOCATION_CITY AS CITY,
                U.EMPLOYMENT_LOCATION_REGION_CODE AS REGION_CODE,
                U.EMPLOYMENT_LOCATION_FORMATTED AS FORMATTED
            ) AS LOCATION
        ) AS EMPLOYMENT
    ) AS DETAILS
FROM UNIONED AS U

How to convert the following dictionary format column into different format in Hive or Presto?

I have a Hive table as below:
event_name
attendees_per_countries
a
{'US':5}
b
{'US':4, 'UK': 3, 'CA': 2}
c
{'UK':2, 'CA': 1}
And I want to get a new table like below:
country
number_of_people
US
9
UK
5
CA
4
How to write a query in Hive or Presto?
You may use the following:
If the column type for attendees_per_countries is a string, you may use the following:
-- Parse the dictionary-like string into a map, then total the values per key.
WITH sample_data AS (
    SELECT
        event_name,
        str_to_map(
            -- Strip only the surrounding braces; the character class must not
            -- contain '|', which would also remove literal pipes from the data.
            regexp_replace(attendees_per_countries, '[{}]', ''),
            ',',
            ':'
        ) AS attendees_per_countries
    FROM
        raw_data
)
SELECT
    -- Keys still carry the quotes and spaces of the original string.
    regexp_replace(cm.key, "[' ]", "") AS country,
    SUM(cm.value) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm
GROUP BY regexp_replace(cm.key, "[' ]", "")
ORDER BY no_of_people DESC
However, if the column type for attendees_per_countries is already a map then you may use the following
-- Explode the map into (key, value) rows and total the values per cleaned key.
SELECT
    regexp_replace(cm.raw_country, "[' ]", "") AS country,
    SUM(cm.attendees) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm AS raw_country, attendees
GROUP BY regexp_replace(cm.raw_country, "[' ]", "")
ORDER BY no_of_people DESC
Full reproducible example below
-- Inline copy of the table from the question.
WITH raw_data AS (
    SELECT 'a' AS event_name, "{'US':5}" AS attendees_per_countries
    UNION ALL
    SELECT 'b', "{'US':4, 'UK': 3, 'CA': 2}"
    UNION ALL
    SELECT 'c', "{'UK':2, 'CA': 1}"
),

-- Strip the braces and parse "key:value" pairs separated by commas into a map.
sample_data AS (
    SELECT
        event_name,
        str_to_map(
            regexp_replace(attendees_per_countries, '[{}]', ''),
            ',',
            ':'
        ) AS attendees_per_countries
    FROM raw_data
)

-- Keys still carry quotes and spaces, so clean them before grouping.
SELECT
    regexp_replace(cm.raw_country, "[' ]", "") AS country,
    SUM(cm.attendees) AS no_of_people
FROM sample_data
LATERAL VIEW explode(attendees_per_countries) cm AS raw_country, attendees
GROUP BY regexp_replace(cm.raw_country, "[' ]", "")
ORDER BY no_of_people DESC
Let me know if this works for you
In Presto, if attendees_per_countries is already a map, you can use map_values and then sum the values with array_sum or reduce (I use the latter because Athena does not support the former). If it is not, you can treat your data as JSON, cast it to MAP(VARCHAR, INTEGER), and then use the functions mentioned above:
-- NOTE(review): this totals the attendees of each event (event_name is
-- merely aliased as "country"); it does not aggregate across events per
-- country key.
WITH dataset (event_name, attendees_per_countries) AS (
    VALUES
        ('a', JSON '{"US":5}'),
        ('b', JSON '{"US":4, "UK": 3, "CA": 2}'),
        ('c', JSON '{"UK":2, "CA": 1}')
)
SELECT
    event_name AS country,
    -- Fold the map's values into a single sum, starting from 0.
    reduce(
        map_values(CAST(attendees_per_countries AS MAP(VARCHAR, INTEGER))),
        0,
        (running_total, attendees) -> running_total + attendees,
        total -> total
    ) AS number_of_people
FROM dataset
ORDER BY number_of_people DESC
Output:
country
number_of_people
b
9
a
5
c
3

PRESTO MAP_FROM_ENTRIES AND CROSS JOIN UNNEST

I have two queries which I Union and then use crossjoin unnest. The main purpose is I want to get the output that is item_name, item_value in horizontal table rather than vertical
-- One metrics map per ds from each source table, then one output row per
-- (ds, metric name, metric value).
WITH base AS (
    SELECT
        ds,
        MAP_FROM_ENTRIES(ARRAY[('X', COUNT_IF(X != 0)), ('y', SUM(Y))]) AS metrics_map
    FROM table1
    GROUP BY ds  -- ds is selected next to aggregates, so it must be grouped
    UNION ALL
    SELECT
        ds,
        MAP_FROM_ENTRIES(ARRAY[('A', COUNT_IF(A != 0)), ('b', SUM(B))]) AS metrics_map
    FROM table2
    GROUP BY ds
)
SELECT
    ds,
    metric_name,
    metric_value
FROM base
CROSS JOIN UNNEST(metrics_map) AS t (metric_name, metric_value)
The output should be ds, metric_name, metric_value, with the values X, y, A, B in metric_name, but I only get A and B. Can anyone help me figure this out?
You can skip MAP_FROM_ENTRIES and use default row fields names for your entries:
-- Each ds carries an array of anonymous (name, value) rows.
WITH dataset (ds, metrics_array) AS (
    VALUES
        (1, ARRAY[('X', 1), ('y', 2)]),
        (2, ARRAY[('A', 3), ('B', 4)])
)
-- Read the anonymous row fields through their default names field0 / field1.
SELECT
    ds,
    entry.field0 AS metric_name,
    entry.field1 AS metric_value
FROM dataset
CROSS JOIN UNNEST(metrics_array) AS u (entry)
And possibly use transform to give more meaningful names:
-- Each ds carries an array of anonymous (name, value) rows.
WITH dataset (ds, metrics_array) AS (
    VALUES
        (1, ARRAY[('X', 1), ('y', 2)]),
        (2, ARRAY[('A', 3), ('B', 4)])
),

-- Cast every entry to a row type with named fields.
named AS (
    SELECT
        ds,
        transform(
            metrics_array,
            entry -> CAST(entry AS ROW(metric_name VARCHAR, metric_value DOUBLE))
        ) AS metrics_array
    FROM dataset
)

SELECT
    ds,
    metric.metric_name,
    metric.metric_value
FROM named
CROSS JOIN UNNEST(metrics_array) AS t (metric)
Output:
ds
metric_name
metric_value
1
X
1.0
1
y
2.0
2
A
3.0
2
B
4.0

Generate CASE WHEN statement using another table

I would like to create a query that does the following:
Using a regex_mapping table, find all rows in the sample data that REGEXP_MATCH on x
-- Question sketch: mapped_item is not defined yet; the trailing comment
-- shows the CASE expression that should be generated from regex_mapping.
WITH sample_data AS (
SELECT x, y
FROM (SELECT "asd rmkt asdf" AS x, true AS y UNION ALL -- should map to remekartier
SELECT "as asdf", true UNION ALL -- should map to ali sneider
SELECT "asdafsd", false) -- should map to NULL
),
regex_mapping AS (
SELECT regex, map
FROM (SELECT "as" AS regex, "ali sneider" AS map UNION ALL
SELECT "rmkt" AS regex, "remekartier" AS map )
)
SELECT sample_data.*, mapped_item
FROM sample_data
-- but here, use multiple REGEXP_MATCH with CASE WHEN looping over the regex_mappings.
-- e.g. CASE WHEN REGEXP_MATCH(x, "as") THEN "ali sneider"
--           WHEN REGEXP_MATCH(x, "rmkt") THEN "remekartier" END AS mapped_item
Try this -
WITH sample_data AS (
    SELECT "asd rmkt asdf" AS x, true AS y  -- should map to remekartier
    UNION ALL
    SELECT "as asdf", true  -- should map to ali sneider
    UNION ALL
    SELECT "asdafsd", false
),

regex_mapping AS (
    SELECT "as" AS regex, "ali sneider" AS map
    UNION ALL
    SELECT "rmkt" AS regex, "remekartier" AS map
)

-- Pair every sample row with every mapping and keep the pairs whose regex
-- occurs in x as a whole word; rows with no matching regex are not returned.
SELECT
    s.*,
    m.map
FROM sample_data AS s
CROSS JOIN regex_mapping AS m
WHERE REGEXP_CONTAINS(s.x, CONCAT('\\b', m.regex, '\\b'))
The results ->
Second way: Instead of cross-join, use a scalar subquery. I have used limit so that the subquery doesn't return more than 1 row and if multiple regexp matches, then it will return only one of them
--- same WITH clause as above query ---
SELECT
    s.*,
    (
        -- At most one mapping per row; with no ORDER BY, which of several
        -- matching mappings is returned is left to the engine.
        SELECT m.map
        FROM regex_mapping AS m
        WHERE REGEXP_CONTAINS(s.x, CONCAT('\\b', m.regex, '\\b'))
        LIMIT 1
    ) AS map
FROM sample_data AS s
The results ->
Third way: Deduplicated Data
WITH sample_data AS (
    SELECT "as_rmkt_asdf" AS campaign_name, "xdd" AS placement_name  -- matches both regexes; rmkt has priority 1
    UNION ALL
    SELECT "as_asdf", "sdfsdf"  -- matches only "as"
    UNION ALL
    SELECT "as_rmkt_dafsd", "sdfg"  -- matches both regexes; rmkt has priority 1
    UNION ALL
    SELECT "asf_adsdf", "gdf"  -- matches neither regex, so map is NULL
),

regex_mapping AS (
    SELECT "rmkt" AS regex, "remekartier" AS map, 1 AS priority
    UNION ALL
    SELECT "as" AS regex, "ali sneider" AS map, 2 AS priority
),

-- Every (campaign, mapping) pair, flagged by whether the regex occurs as an
-- underscore-delimited token of the campaign name.
candidates AS (
    SELECT
        s.campaign_name,
        s.placement_name,
        m.map AS candidate_map,
        m.priority,
        REGEXP_CONTAINS(s.campaign_name, CONCAT('(^|_)', m.regex, '($|_)')) AS is_match
    FROM sample_data AS s
    CROSS JOIN regex_mapping AS m
),

-- Rank the candidates of each campaign: matches first, then by priority.
X AS (
    SELECT
        campaign_name,
        placement_name,
        IF(is_match, candidate_map, NULL) AS map,
        ROW_NUMBER() OVER (
            PARTITION BY campaign_name
            ORDER BY is_match DESC, priority
        ) AS rn
    FROM candidates
)

SELECT * EXCEPT (rn)
FROM X
WHERE rn = 1