BigQuery: Union on repeated fields with different order of fields - google-bigquery

How to make a UNION ALL work for repeated fields if the order of the fields does not match?
In the example below I try to UNION data_1_nested and data_2_nested, while the repeated field nested has two fields: id and age but in different order.
I could UNNEST and re-nest, but this would not be very helpful if I have more than one nested field that I need to UNION on.
Example:
-- Reproduction from the question: both *_nested CTEs collect (age, grade)
-- pairs into a repeated STRUCT column, but list the struct fields in
-- opposite order before the two results are combined with UNION ALL.
WITH
  data_1 AS (
    SELECT 'a123' AS id, 1 AS age, 'a' AS grade
    UNION ALL
    SELECT 'a123' AS id, 3 AS age, 'b' AS grade
    UNION ALL
    SELECT 'a123' AS id, 4.5 AS age, 'c' AS grade
  ),
  data_2 AS (
    SELECT 'b456' AS id, 6 AS age, 'e' AS grade
    UNION ALL
    SELECT 'b456' AS id, 5 AS age, 'f' AS grade
    UNION ALL
    SELECT 'b456' AS id, 2.5 AS age, 'g' AS grade
  ),
  -- Struct fields in (age, grade) order.
  data_1_nested AS (
    SELECT
      id,
      ARRAY_AGG(STRUCT(age, grade)) AS nested
    FROM data_1
    GROUP BY id
  ),
  -- Same fields, but in (grade, age) order.
  data_2_nested AS (
    SELECT
      id,
      ARRAY_AGG(STRUCT(grade, age)) AS nested
    FROM data_2
    GROUP BY id
  )
SELECT id, nested
FROM data_1_nested
UNION ALL
SELECT id, nested
FROM data_2_nested

Below should work for you
-- Rebuild every element of data_2_nested.nested with its fields in
-- (age, grade) order so both branches of the UNION ALL share one struct layout.
SELECT *
FROM data_1_nested
UNION ALL
SELECT
  t.id,
  ARRAY(
    SELECT AS STRUCT
      item.age,
      item.grade
    FROM UNNEST(t.nested) AS item
  )
FROM data_2_nested AS t
if applied to sample data from your question - output is

I modified your data a little bit to make two nested fields that need to be unioned. I also added a JS function for parsing the JSON. It is an ugly solution, but it seems to be working. I am not sure whether it is scalable (how many functions would have to be created to convert different nested fields).
-- Parses a JSON array of {age, grade} objects (for example the output of
-- TO_JSON_STRING on a repeated struct column) back into a typed array.
-- JSON objects are matched to struct fields by name, so the order of the
-- keys in the input text does not matter.
--
-- input:   JSON text holding an array of objects with "age" and "grade" keys.
-- returns: ARRAY<STRUCT<age FLOAT64, grade STRING>>.
--
-- age is FLOAT64 rather than INT64 because the sample data contains
-- fractional ages (4.5 and 2.5), which are not integer values.
CREATE TEMP FUNCTION JsonToItems(input STRING)
RETURNS ARRAY<STRUCT<age FLOAT64, grade STRING>>
LANGUAGE js AS """
return JSON.parse(input);
""";
-- Sample data from the question, extended with a second repeated column
-- (nested2) so that two nested fields have to be unioned.
WITH
  data_1 AS (
    SELECT 'a123' AS id, 1 AS age, 'a' AS grade
    UNION ALL
    SELECT 'a123' AS id, 3 AS age, 'b' AS grade
    UNION ALL
    SELECT 'a123' AS id, 4.5 AS age, 'c' AS grade
  ),
  data_2 AS (
    SELECT 'b456' AS id, 6 AS age, 'e' AS grade
    UNION ALL
    SELECT 'b456' AS id, 5 AS age, 'f' AS grade
    UNION ALL
    SELECT 'b456' AS id, 2.5 AS age, 'g' AS grade
  ),
  -- Both repeated columns use (age, grade) field order.
  data_1_nested AS (
    SELECT
      id,
      ARRAY_AGG(STRUCT(age, grade)) AS nested,
      ARRAY_AGG(STRUCT(age, grade)) AS nested2
    FROM data_1
    GROUP BY id
  ),
  -- Both repeated columns use (grade, age) field order.
  data_2_nested AS (
    SELECT
      id,
      ARRAY_AGG(STRUCT(grade, age)) AS nested,
      ARRAY_AGG(STRUCT(grade, age)) AS nested2
    FROM data_2
    GROUP BY id
  )
-- Each array is serialised to a JSON string before the UNION ALL, so the set
-- operation only combines STRING columns; JsonToItems then parses the text
-- back into typed arrays.
SELECT
  id,
  JsonToItems(json),
  JsonToItems(json2)
FROM (
  SELECT
    id,
    TO_JSON_STRING(nested) AS json,
    TO_JSON_STRING(nested2) AS json2
  FROM data_1_nested
  UNION ALL
  SELECT
    id,
    TO_JSON_STRING(nested) AS json,
    TO_JSON_STRING(nested2) AS json2
  FROM data_2_nested
);

Related

How to SUM values from 2 separate tables that share the same column name in SQL

I have 2 tables that have the exact same columns but different data. The columns are 'name', 'gender' and 'count'. The first table is called names_2014 and the second names_2015. My goal is simply to find the top 5 most popular names amongst both these tables.
I know that to get the most popular names for one table is:
-- Five rows with the highest count in the 2014 table alone.
SELECT
  name,
  count
FROM names_2014
ORDER BY count DESC
LIMIT 5;
However, the closest I've gotten to my goal is:
-- Asker's attempt (kept verbatim). The first branch returns the raw
-- names_2014 rows, and only the names_2015 branch is summed per name, so
-- counts are never added together across the two tables.
SELECT name, count
FROM names_2014
UNION DISTINCT -- I've tried UNION ALL as well
SELECT name, SUM(count)
FROM names_2015
GROUP BY name
ORDER BY count DESC
LIMIT 5
I've tried many similar variations to this but none of them are successful. It seems that I need to combine both of the tables, and then SUM(count) and GROUP BY name but I guess I'm not combining the tables properly. Any help is much appreciated as I've spent hours on this and I feel like the solution is so close but I just can't see it. I'm new to SQL and just trying to test my limits.
You may perform the aggregation on a subquery that unions the two tables as the following:
-- Stack both yearly tables first, then total the counts per name and keep
-- the five largest totals.
SELECT
  name,
  SUM(count) AS cnt
FROM (
  SELECT
    name,
    count
  FROM names_2014
  UNION ALL
  SELECT
    name,
    count
  FROM names_2015
) AS T
GROUP BY name
ORDER BY cnt DESC
LIMIT 5
From your final ask it's not clear if you want to separate these top 5 by source table or not. Following is one answer that you might be looking for:
-- Inline sample data standing in for the two yearly tables.
WITH
  name_2014 AS (
    SELECT 'a' AS name, 'm' AS gender, 1 AS cnt
    UNION ALL
    SELECT 'b' AS name, 'f' AS gender, 3 AS cnt
    UNION ALL
    SELECT 'c' AS name, 'm' AS gender, 2 AS cnt
  ),
  name_2015 AS (
    SELECT 'd' AS name, 'f' AS gender, 10 AS cnt
    UNION ALL
    SELECT 'b' AS name, 'f' AS gender, 5 AS cnt
    UNION ALL
    SELECT 'e' AS name, 'm' AS gender, 1 AS cnt
  )
-- Each branch is parenthesised so its ORDER BY / LIMIT applies to that
-- table only; the label column records which table the row came from.
(
  SELECT
    'name_2014' AS src_table_name,
    name,
    SUM(cnt) AS total_counts
  FROM name_2014
  GROUP BY name
  ORDER BY total_counts DESC
  LIMIT 1
)
UNION ALL
(
  SELECT
    'name_2015' AS src_table_name,
    name,
    SUM(cnt) AS total_counts
  FROM name_2015
  GROUP BY name
  ORDER BY total_counts DESC
  LIMIT 1
)
This sample query will give you top 1 names per table. (You can change limit and get top 5 from your query.)
If you do not want to know table names you can tweak the above query.
If you do not care about source tables at all and just want top 5 then:
-- Inline sample data standing in for the two yearly tables.
WITH
  name_2014 AS (
    SELECT 'a' AS name, 'm' AS gender, 1 AS cnt
    UNION ALL
    SELECT 'b' AS name, 'f' AS gender, 3 AS cnt
    UNION ALL
    SELECT 'c' AS name, 'm' AS gender, 2 AS cnt
  ),
  name_2015 AS (
    SELECT 'd' AS name, 'f' AS gender, 10 AS cnt
    UNION ALL
    SELECT 'b' AS name, 'f' AS gender, 5 AS cnt
    UNION ALL
    SELECT 'e' AS name, 'm' AS gender, 1 AS cnt
  )
-- Total per name across both tables, largest five totals first.
SELECT
  name,
  SUM(cnt) AS total_count
FROM (
  SELECT name, cnt
  FROM name_2014
  UNION ALL
  SELECT name, cnt
  FROM name_2015
)
GROUP BY name
ORDER BY total_count DESC
LIMIT 5

How to dedup array_agg in bigquery

I created a new table with repeating records with duplicates.
I am trying to find the most efficient way to deduplicate records as this will be run
on a table with millions of records.
If you use multiple nested CTEs, does the data structure matter? Is the processing done in memory, or is it written to temp tables when there is a lot of data?
-- Builds the test table: one row per id, with every last_name for that id
-- (duplicates included) collected into a repeated single-field struct.
CREATE OR REPLACE TABLE t1.cte4 AS
WITH t1 AS (
  SELECT 1 AS id, 'eren' AS last_name
  UNION ALL
  SELECT 1 AS id, 'yilmaz' AS last_name
  UNION ALL
  SELECT 1 AS id, 'kaya' AS last_name
  UNION ALL
  SELECT 1 AS id, 'kaya' AS last_name
  UNION ALL
  SELECT 2 AS id, 'smith' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'brown' AS last_name
)
SELECT
  id,
  ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM t1
GROUP BY id;
I can remove duplicates as follows.
QUERY 1 How to dedup the concat_struct ?
-- concat_string is de-duplicated through STRING_AGG(DISTINCT ...);
-- concat_struct aggregates every unnested row, so duplicates remain in it.
SELECT
  id,
  STRING_AGG(DISTINCT ln.last_name, '~') AS concat_string,
  ARRAY_AGG(STRUCT(ln.last_name)) AS concat_struct
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY id;
QUERY 1
QUERY 2 Is there a better way than this to dedup?
-- Grouping by (id, last_name) collapses duplicate names; the window
-- ARRAY_AGG then gathers the remaining names per id, and DISTINCT removes
-- the repeated per-id rows that the window function leaves behind.
SELECT DISTINCT
  id,
  TO_JSON_STRING(ARRAY_AGG(ln.last_name) OVER (PARTITION BY id)) AS json_string
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY
  id,
  ln.last_name;
QUERY 2
How do I get it out of the table as distinct rather than using the CTE? This does not dedup.
-- Unnests and re-aggregates the stored structs as-is; no step here removes
-- duplicate last names.
SELECT
  id,
  ARRAY_AGG(STRUCT(ln.last_name)) AS concat_struct
FROM t1.cte4
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY id;
I can't do this.
-- Asker's attempt (kept verbatim), which they report they cannot run.
-- NOTE(review): presumably rejected because DISTINCT is applied directly to
-- a STRUCT argument -- confirm against the ARRAY_AGG documentation.
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;
UPDATE: Decompose the struct before deduplication and then compose it back:
-- De-duplicates the repeated struct column in two steps:
--   1. flatten last_name_rec and keep one row per (id, last_name);
--   2. wrap each remaining last_name back into a struct and re-aggregate per id.
-- The table is referenced as `t1.cte4`, the name it is created under and
-- queried by elsewhere in this thread (a bare cte4 only resolves when a
-- default dataset happens to be configured).
SELECT
  id,
  ARRAY_AGG(STRUCT(last_name)) AS concat_struct
FROM (
  -- GROUP BY on both columns collapses duplicate names within an id.
  SELECT
    id,
    ln.last_name
  FROM `t1.cte4`
  CROSS JOIN UNNEST(last_name_rec) AS ln
  GROUP BY
    id,
    ln.last_name
) AS d
GROUP BY id
(original answer based on unwanted change of table definition follows)
Just use array_agg(distinct ...):
-- Aggregating the plain string (not a struct) lets DISTINCT drop the
-- duplicate last names at aggregation time.
WITH t1 AS (
  SELECT 1 AS id, 'eren' AS last_name
  UNION ALL
  SELECT 1 AS id, 'yilmaz' AS last_name
  UNION ALL
  SELECT 1 AS id, 'kaya' AS last_name
  UNION ALL
  SELECT 1 AS id, 'kaya' AS last_name
  UNION ALL
  SELECT 2 AS id, 'smith' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'jones' AS last_name
  UNION ALL
  SELECT 2 AS id, 'brown' AS last_name
)
SELECT
  id,
  ARRAY_AGG(DISTINCT last_name) AS last_name_rec
FROM t1
GROUP BY id;

Count distinct letters in a string in bigquery

I have a string column in BigQuery like:
-- Five sample strings, one per row.
SELECT 'A'
UNION ALL
SELECT 'ab'
UNION ALL
SELECT 'abc'
UNION ALL
SELECT 'aa'
UNION ALL
SELECT 'aab'
I would like to count the number of distinct characters in every row of the column, in this case the results would be:
1
2
3
1
2
Can this be done in BigQuery? How?
How about this (assuming you don't want to differentiate between uppercase and lowercase)...
-- Counts the distinct letters a-z (case-insensitive) in each value.
-- Approach: delete from the 26-letter alphabet every letter that occurs in
-- the value; 26 minus the length of what is left is the number of distinct
-- letters present.
WITH `data` AS (
  SELECT 'A' AS `val`
  UNION ALL
  SELECT 'ab'
  UNION ALL
  SELECT 'abc'
  UNION ALL
  SELECT 'aa'
  UNION ALL
  SELECT 'aab'
),
-- Keep only a-z before building the character class. Splicing the raw value
-- into '[...]' lets characters such as '-', ']', '^' or '\' change the
-- meaning of the pattern (e.g. 'a-c' would become the range a..c).
letters_only AS (
  SELECT
    `val`,
    REGEXP_REPLACE(LOWER(`val`), r'[^a-z]', '') AS letters
  FROM `data`
)
SELECT
  `val`,
  -- An empty class '[]' is not a valid regular expression, so values with
  -- no letters are answered directly with 0. A NULL value yields NULL.
  IF(
    letters = '',
    0,
    26 - LENGTH(REGEXP_REPLACE('abcdefghijklmnopqrstuvwxyz', '[' || letters || ']', ''))
  ) AS distinct_letters
FROM letters_only;
A simple approach is to use the SPLIT to convert your string to an array and UNNEST to convert the resulting array to a table. You may then use COUNT and DISTINCT to determine the number of unique characters as shown below:
WITH my_data AS (
  SELECT 'A' AS col
  UNION ALL
  SELECT 'ab'
  UNION ALL
  SELECT 'abc'
  UNION ALL
  SELECT 'aa'
  UNION ALL
  SELECT 'aab'
)
SELECT
  col,
  -- Split the string into single characters, de-duplicate them, and count
  -- what remains.
  (
    SELECT COUNT(*)
    FROM (
      SELECT DISTINCT element
      FROM UNNEST(SPLIT(col, '')) AS element
    )
  ) AS n
FROM my_data;
or simply
WITH my_data AS (
  SELECT 'A' AS col
  UNION ALL SELECT 'ab'
  UNION ALL SELECT 'abc'
  UNION ALL SELECT 'aa'
  UNION ALL SELECT 'aab'
)
-- Per row: split into single characters and count the distinct ones.
SELECT
  col,
  (SELECT COUNT(DISTINCT ch) FROM UNNEST(SPLIT(col, '')) AS ch) AS cnt
FROM my_data;
Like previous but using COUNT with DISTINCT
WITH my_data AS (
  SELECT 'A' AS col
  UNION ALL
  SELECT 'ab'
  UNION ALL
  SELECT 'abc'
  UNION ALL
  SELECT 'aa'
  UNION ALL
  SELECT 'aab'
)
-- One row per (string, character) after the join; grouping by the string
-- and counting distinct characters gives the per-string result.
SELECT
  col,
  COUNT(DISTINCT element)
FROM my_data
CROSS JOIN UNNEST(SPLIT(col, '')) AS element
GROUP BY col
If the data is not quite huge, I would rather go with the user-defined functions to ease up the string manipulation across different columns
-- Returns the number of distinct characters in x (case-sensitive).
--
-- x:       the string to inspect; NULL is allowed.
-- returns: INT64 count of distinct characters, or NULL when x is NULL.
CREATE TEMP FUNCTION
get_unique_char_count(x STRING)
RETURNS INT64
LANGUAGE js AS r"""
  // A NULL argument yields NULL instead of a TypeError from using null as a string.
  if (x === null || x === undefined) {
    return null;
  }
  // Building the Set from the string itself iterates whole code points, so a
  // character outside the Basic Multilingual Plane is counted once rather
  // than as two separate surrogate halves (which x.split("") would produce).
  return new Set(x).size;
""";
-- Applies get_unique_char_count to each sample string.
WITH result AS (
  SELECT 'A' AS val
  UNION ALL
  SELECT 'ab'
  UNION ALL
  SELECT 'abc'
  UNION ALL
  SELECT 'aa'
  UNION ALL
  SELECT 'aab'
)
SELECT
  val,
  get_unique_char_count(val) AS unique_char_count
FROM result
RESULT:

unpivot query join to other tables

I have a query like below
-- Turns the five tag columns of table_one into rows: each output row keeps
-- the remaining columns plus the tag's source column name (_tag_) and its
-- value.
WITH t AS (
  SELECT
    ID,
    name,
    tag,
    tag_1,
    tag_2,
    tag_3,
    tag_4,
    location
  FROM table_one
)
SELECT *
FROM t
UNPIVOT (
  value FOR _tag_ IN (tag, tag_1, tag_2, tag_3, tag_4)
)
Now, I want to join 3 other tables table1, table2, table3 to the above, I need to select other columns example col1, col2, col3 from those tables. Any idea on how to proceed with that.
I would use a lateral join in Oracle 12C+:
-- For each row of t, the lateral subquery emits five rows, one per tag
-- column, all under the column names of the first branch (id, name, tag).
select u.*
from t
cross apply (
    select t.id, t.name, t.tag from dual
    union all
    select t.id, t.name, t.tag_1 from dual
    union all
    select t.id, t.name, t.tag_2 from dual
    union all
    select t.id, t.name, t.tag_3 from dual
    union all
    select t.id, t.name, t.tag_4 from dual
) u;
You can then join to u as you would anything else:
-- Template, not runnable as written: ". . ." stands for the additional
-- columns to select, x for the table being joined, and "u.? = x.?" for the
-- columns that relate the unpivoted rows (u) to x.
select u.*, . . .
from t cross apply
(select id, name, tag from dual union all
select id, name, tag_1 from dual union all
select id, name, tag_2 from dual union all
select id, name, tag_3 from dual union all
select id, name, tag_4 from dual
) u join
x
on u.? = x.?;
In Oracle 11, you can do something similar if you make the unpivot a subquery or CTE.

What's the best way of re-using classification rules for multiple queries within big query standard SQL?

I'm using Big Query to analyse Google Analytics data.
I need to classify visits dependent on whether they visit particular URLs that indicate they were in the booking process or purchased etc.
There is a long list of URLs that represent each step and hence it would be advantageous to include the classifications within a view and re-use with appropriate joins for whatever query requires the classification.
I have the following view that seems to do what I need:
-- View body from the question: one row per PAGE hit, carrying the
-- lower-cased page path and a CASE-based classification of that path.
-- '{path-here}', {project-id}, {data-id} and the dotted line are the
-- asker's placeholders for elided values and rules, so this is not
-- runnable as written.
SELECT
fullVisitorId,
visitID,
LOWER(h.page.pagePath) AS path,
CASE
WHEN
LOWER(h.page.pagePath) = '/' THEN '/'
WHEN
LOWER(h.page.pagePath) LIKE '{path-here}%' OR
.... .... ....
ELSE 'other'
END
AS path_classification,
_TABLE_SUFFIX AS date
FROM
`{project-id}.{data-id}.ga_sessions_*`, UNNEST(hits) AS h
WHERE
-- keep only tables whose suffix contains eight consecutive digits
REGEXP_CONTAINS(_TABLE_SUFFIX, r'[0-9]{8}')
AND
h.type = 'PAGE'
I'm wondering if there's a simpler way of achieving this that doesn't require selecting from a pre-existing table as this doesn't seem necessary to define the classifications. I get the feeling that it's possible to use something more straight forward, but I'm not sure how to do it.
Does anyone know how to put these definitions into a view without querying a table within the view?
Let's consider simple example:
#standardSQL
-- Baseline: the classification rules are written inline as a CASE.
WITH yourTable AS (
  SELECT 1 AS id, '123' AS path
  UNION ALL
  SELECT 2, '234'
  UNION ALL
  SELECT 3, '345'
  UNION ALL
  SELECT 4, '456'
)
SELECT
  id,
  path,
  CASE
    WHEN path = '123' THEN 'a'
    WHEN path = '234' THEN 'b'
    WHEN path = '345' THEN 'c'
    ELSE 'other'
  END AS path_classification
FROM yourTable
ORDER BY id
Above can be refactored into below
#standardSQL
-- Same classification, but the rules come from `project.dataset.rules`
-- instead of an inline CASE. All rules are packed into one array (r.rules)
-- and looked up per row.
WITH yourTable AS (
  SELECT 1 AS id, '123' AS path
  UNION ALL
  SELECT 2, '234'
  UNION ALL
  SELECT 3, '345'
  UNION ALL
  SELECT 4, '456'
)
SELECT
  id,
  path,
  COALESCE(
    -- rule whose path equals this row's path
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
    -- otherwise the rule stored with a NULL path acts as the default
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification
FROM yourTable
CROSS JOIN (
  SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
  FROM `project.dataset.rules`
) AS r
ORDER BY id
which relies on rules view that is defined as below
#standardSQL
-- Rules view: one (path, rule) pair per row; the NULL path row carries the
-- fallback classification.
SELECT '123' AS path, 'a' AS rule
UNION ALL
SELECT '234', 'b'
UNION ALL
SELECT '345', 'c'
UNION ALL
SELECT NULL, 'other'
As you can see all classification rules are only in rules view!
You can play around this approach with below :
#standardSQL
-- Self-contained version for experimenting: both the data and the rules are
-- inline CTEs.
WITH
  yourTable AS (
    SELECT 1 AS id, '123' AS path
    UNION ALL
    SELECT 2, '234'
    UNION ALL
    SELECT 3, '345'
    UNION ALL
    SELECT 4, '456'
  ),
  rules AS (
    SELECT '123' AS path, 'a' AS rule
    UNION ALL
    SELECT '234', 'b'
    UNION ALL
    SELECT '345', 'c'
    UNION ALL
    SELECT NULL, 'other'
  )
SELECT
  id,
  path,
  COALESCE(
    -- rule whose path equals this row's path
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
    -- otherwise the rule stored with a NULL path acts as the default
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification
FROM yourTable
CROSS JOIN (
  SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
  FROM rules
) AS r
ORDER BY id
this can be further "simplified" by moving ARRAY_AGG inside view as below
#standardSQL
-- Rules view that already packs every (path, rule) pair into a single
-- array column named rules.
SELECT
  ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
FROM (
  SELECT '123' AS path, 'a' AS rule
  UNION ALL
  SELECT '234', 'b'
  UNION ALL
  SELECT '345', 'c'
  UNION ALL
  SELECT NULL, 'other'
)
In this case final query is as simple as below
#standardSQL
-- Final form: rules is joined directly and its rules array is searched per row.
SELECT
  id,
  path,
  COALESCE(
    -- rule whose path equals this row's path
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
    -- otherwise the rule stored with a NULL path acts as the default
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification
FROM yourTable
CROSS JOIN rules AS r
ORDER BY id
Depending on your specific rules, the above can (and should) be adjusted and optimized accordingly, but I hope this gives you the main direction.
Q in comment: does your solution enable the use of matching with the LIKE keyword or matching with regex?
Original question was - What's the … way of re-using classification rules for multiple queries within big query standard SQL?
So above examples in my initial answer just show you how to make this happen (focus on “reuse”)
How you will use it (matching with the LIKE keyword or matching with regex) is totally up to you!
See example below
Take a look at path_classification_exact_match vs path_classification_like_match vs path_classification_regex_match
#standardSQL
-- Compares three ways of matching a row's path against the same rule set:
-- exact equality, LIKE (substring), and regular expression.
WITH
  yourTable AS (
    SELECT 1 AS id, '123' AS path
    UNION ALL
    SELECT 2, '234'
    UNION ALL
    SELECT 3, '345'
    UNION ALL
    SELECT 4, '456'
    UNION ALL
    SELECT 5, '234abc'
    UNION ALL
    SELECT 6, '345bcd'
    UNION ALL
    SELECT 7, '456cde'
  ),
  -- Single row holding every (path, rule) pair as an array; the NULL path
  -- entry is the fallback classification.
  rules AS (
    SELECT
      ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
    FROM (
      SELECT '123' AS path, 'a' AS rule
      UNION ALL
      SELECT '234', 'b'
      UNION ALL
      SELECT '345', 'c'
      UNION ALL
      SELECT NULL, 'other'
    )
  )
SELECT
  id,
  path,
  -- path must equal the rule's path
  COALESCE(
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification_exact_match,
  -- rule's path may appear anywhere inside path
  COALESCE(
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE path LIKE CONCAT('%',rr.cpath,'%') LIMIT 1),
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification_like_match,
  -- rule's path is used as a regular expression searched for in path
  COALESCE(
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE REGEXP_CONTAINS(path, rr.cpath) LIMIT 1),
    (SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
  ) AS path_classification_regex_match
FROM yourTable
CROSS JOIN rules AS r
ORDER BY id
Output is:
id path path_classification_exact_match path_classification_like_match path_classification_regex_match
1 123 a a a
2 234 b b b
3 345 c c c
4 456 other other other
5 234abc other b b
6 345bcd other c c
7 456cde other other other
Hope this helps :o)
It sounds like you may be interested in WITH clauses, which let you compose queries without having to use subqueries. For example,
#standardSQL
WITH Sales AS (
SELECT 1 AS sku, 3.14 AS price UNION ALL
SELECT 2 AS sku, 1.00 AS price UNION ALL
SELECT 3 AS sku, 9.99 AS price UNION ALL
SELECT 2 AS sku, 0.90 AS price UNION ALL
SELECT 1 AS sku, 3.56 AS price
),
ItemTotals AS (
SELECT sku, SUM(price) AS total
FROM Sales
GROUP BY sku
)
SELECT sku, total
FROM ItemTotals;
If you want to compose expressions, you can use CREATE TEMP FUNCTION statements to provide "macro-like" functionality:
#standardSQL
-- SQL temp function used like a macro: TRUE when the lower-cased input is
-- one of the three listed cheese names.
CREATE TEMP FUNCTION LooksLikeCheese(s STRING) AS (
  LOWER(s) IN ('gouda', 'gruyere', 'havarti')
);

-- Applies the same check to both string columns of each sample row.
SELECT
  s1,
  LooksLikeCheese(s1) AS s1_is_cheese,
  s2,
  LooksLikeCheese(s2) AS s2_is_cheese
FROM (
  SELECT 'spam' AS s1, 'ham' AS s2
  UNION ALL
  SELECT 'havarti' AS s1, 'crackers' AS s2
  UNION ALL
  SELECT 'gruyere' AS s1, 'ice cream' AS s2
);