Unpivot query join to other tables - SQL

I have a query like the one below:
with t as (
select ID, name, tag, tag_1, tag_2, tag_3, tag_4, location from table_one
)
select * from t
unpivot (
value for _tag_ in (tag,tag_1,tag_2,tag_3,tag_4)
)
Now I want to join 3 other tables (table1, table2, table3) to the above, and I need to select additional columns (for example col1, col2, col3) from those tables. Any idea how to proceed with that?

I would use a lateral join in Oracle 12c+:
select u.*
from t cross apply
(select t.id, t.name, t.tag from dual union all
 select t.id, t.name, t.tag_1 from dual union all
 select t.id, t.name, t.tag_2 from dual union all
 select t.id, t.name, t.tag_3 from dual union all
 select t.id, t.name, t.tag_4 from dual
) u;
You can then join to u as you would anything else:
select u.*, . . .
from t cross apply
(select t.id, t.name, t.tag from dual union all
 select t.id, t.name, t.tag_1 from dual union all
 select t.id, t.name, t.tag_2 from dual union all
 select t.id, t.name, t.tag_3 from dual union all
 select t.id, t.name, t.tag_4 from dual
) u join
x
on u.? = x.?;
In Oracle 11, you can do something similar if you make the unpivot a subquery or CTE.
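For the concrete case in the question (joining table1, table2, table3 and selecting col1, col2, col3), a minimal sketch is to keep the UNPIVOT in a CTE and join the other tables to its output; the join keys below are assumptions, since the question does not state them:
with t as (
select ID, name, tag, tag_1, tag_2, tag_3, tag_4, location from table_one
),
u as (
select * from t
unpivot (value for _tag_ in (tag, tag_1, tag_2, tag_3, tag_4))
)
select u.*, t1.col1, t2.col2, t3.col3
from u
join table1 t1 on t1.id = u.id  -- assumed join key
join table2 t2 on t2.id = u.id  -- assumed join key
join table3 t3 on t3.id = u.id; -- assumed join key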

Related

Count distinct letters in a string in BigQuery

I have a string column in BigQuery like:
select 'A'
union all (select 'ab')
union all (select 'abc')
union all (select 'aa')
union all (select 'aab')
I would like to count the number of distinct characters in every row of the column; in this case the results would be:
1
2
3
1
2
Can this be done in BigQuery? How?
How about this (assuming you don't want to differentiate between uppercase and lowercase)...
WITH data AS (select 'A' AS `val`
union all (select 'ab')
union all (select 'abc')
union all (select 'aa')
union all (select 'aab'))
SELECT `val`, 26 - LENGTH(REGEXP_REPLACE('abcdefghijklmnopqrstuvwxyz', '['||LOWER(`val`)||']', ''))
FROM `data`;
A simple approach is to use SPLIT to convert your string to an array and UNNEST to convert the resulting array to a table. You can then use COUNT and DISTINCT to determine the number of unique characters, as shown below:
with my_data AS (
select 'A' as col
union all (select 'ab')
union all (select 'abc')
union all (select 'aa')
union all (select 'aab')
)
select col, (SELECT COUNT(*) FROM (
SELECT DISTINCT element FROM UNNEST(SPLIT(col,'')) as element
)) n from my_data;
or simply
WITH my_data AS (
SELECT 'A' as col UNION ALL
SELECT 'ab' UNION ALL
SELECT 'abc' UNION ALL
SELECT 'aa' UNION ALL
SELECT 'aab'
)
SELECT
col,
(
SELECT
COUNT(DISTINCT element)
FROM
UNNEST(SPLIT(col,'')) as element
) cnt
FROM
my_data;
Similar to the previous answers, but using COUNT(DISTINCT ...) directly over the unnested characters:
with my_data AS (
select 'A' as col
union all (select 'ab')
union all (select 'abc')
union all (select 'aa')
union all (select 'aab')
)
select col, COUNT(DISTINCT element) FROM
my_data,UNNEST(SPLIT(col,'')) as element
GROUP BY col
If the data is not huge, I would rather go with a user-defined function to simplify the string manipulation across different columns:
CREATE TEMP FUNCTION
get_unique_char_count(x STRING)
RETURNS INT64
LANGUAGE js AS r"""
const str_split = new Set(x.split(""));  // a Set keeps only the unique characters
return str_split.size;
""";
WITH result AS (
  SELECT 'A' AS val
  UNION ALL (SELECT 'ab')
  UNION ALL (SELECT 'abc')
  UNION ALL (SELECT 'aa')
  UNION ALL (SELECT 'aab')
)
SELECT
  val,
  get_unique_char_count(val) AS unique_char_count
FROM
  result
Result (matching the expected output in the question):
val  unique_char_count
A    1
ab   2
abc  3
aa   1
aab  2
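If, as in the first answer, you want to treat uppercase and lowercase as the same character, a minimal variant of the UDF above (the _ci name is mine) lower-cases the input before splitting:
CREATE TEMP FUNCTION
get_unique_char_count_ci(x STRING)
RETURNS INT64
LANGUAGE js AS r"""
// Lower-case first so 'A' and 'a' count as one distinct character.
return new Set(x.toLowerCase().split("")).size;
""";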

How to filter rows by matched values using BigQuery?

I have a table in BigQuery:
SELECT 1 as big_id, 1 as temp_id, '101' as names
UNION ALL SELECT 1,1, 'z3Awwer'
UNION ALL SELECT 1,1, 'gA1sd03'
UNION ALL SELECT 1,2, 'z3Awwer'
UNION ALL SELECT 1,2, 'gA1sd03'
UNION ALL SELECT 1,3, 'gA1sd03'
UNION ALL SELECT 1,3, 'sAs10sdf4'
UNION ALL SELECT 1,4, 'sAs10sdf4'
UNION ALL SELECT 1,5, 'Adf105'
UNION ALL SELECT 2,1, 'A1sdf02'
UNION ALL SELECT 2,1, '345A103'
UNION ALL SELECT 2,2, '345A103'
UNION ALL SELECT 2,2, 'A1sd04'
UNION ALL SELECT 2,3, 'A1sd04'
UNION ALL SELECT 2,4, '6_0Awe105'
I want to filter out a temp_id when all of its names are included in some other temp_id, within a window partitioned by big_id. For example, I do not need the rows where temp_id = 2 because all names of temp_id = 2 are included in temp_id = 1; and I need to keep all rows of temp_id = 1 because its set of names covers the set of names of temp_id = 2.
So the expected output is:
SELECT 1 as big_id, 1 as temp_id, '101' as names
UNION ALL SELECT 1,1, 'z3Awwer'
UNION ALL SELECT 1,1, 'gA1sd03'
UNION ALL SELECT 1,3, 'gA1sd03'
UNION ALL SELECT 1,3, 'sAs10sdf4'
UNION ALL SELECT 1,5, 'Adf105'
UNION ALL SELECT 2,1, 'A1sdf02'
UNION ALL SELECT 2,1, '345A103'
UNION ALL SELECT 2,2, '345A103'
UNION ALL SELECT 2,2, 'A1sd04'
UNION ALL SELECT 2,4, '6_0Awe105'
How can I do this using BigQuery?
Below is for BigQuery Standard SQL
#standardSQL
with temp as (
  -- collapse each (big_id, temp_id) into an array of its names
  select big_id, temp_id, array_agg(names) names
  from `project.dataset.table`
  group by big_id, temp_id
)
select big_id, temp_id, names
from (
  select big_id, temp_id, any_value(names) names
  from (
    select t1.*,
      -- flag is true when every name of t1 also appears under this particular t2 (a different temp_id)
      ( select count(1)
        from t1.names name
        join t2.names name
        using(name)
        where t1.temp_id != t2.temp_id
      ) = array_length(t1.names) as flag
    from temp t1
    join temp t2
    using (big_id)
  )
  group by big_id, temp_id
  -- keep only temp_ids whose names are not fully covered by any other temp_id
  having countif(flag) = 0
), unnest(names) names
Applying the above to the sample data from your question produces the expected output shown in your question.

SQL assigning incremental ID to subgroups

As the title says, I'm trying to add an extra column to a table that auto-increments every time the string in another column changes.
I would like to do this in a query.
Example:
MyCol GroupID
Cable 1
Cable 1
Foo 2
Foo 2
Foo 2
Fuzz 3
Fizz 4
Tv 5
Tv 5
The GroupID column is what I want to accomplish.
We can be sure that MyCol's strings will be the same in each subgroup (Foo will always be Foo, etc).
Thanks in advance
If I understand correctly, you can use dense_rank():
select t.*, dense_rank() over (order by MyCol) as groupid
from t;
You could make a temporary table with the distinct values of MyCol, get the group id from the row number of that temp table, and join the row-numbered result back to your table.
This is a raw example in Oracle:
WITH data AS
(SELECT 'Cable' MyCol FROM dual
UNION ALL
SELECT 'Cable' FROM dual
UNION ALL
SELECT 'Foo' FROM dual
UNION ALL
SELECT 'Foo' FROM dual
UNION ALL
SELECT 'Foo' FROM dual
UNION ALL
SELECT 'Fuzz' FROM dual
UNION ALL
SELECT 'Fizz' FROM dual
UNION ALL
SELECT 'Tv' FROM dual
UNION ALL
SELECT 'Tv' FROM dual
),
tablename AS
(SELECT * FROM data
),
temp AS
( SELECT DISTINCT mycol FROM tablename
),
temp2 AS
( SELECT mycol, rownum AS groupid from temp
)
SELECT tablename.mycol, temp2.groupid FROM temp2 JOIN tablename ON temp2.mycol = tablename.mycol
You could also look into implementing the tabibitosan method, bearing in mind that your grouping column is a string; a sketch of that approach follows below.
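A minimal tabibitosan-style sketch, assuming the table is called MyTable and has some ordering column (called seq here purely as an assumption, since something has to define the row order):
WITH islands AS (
SELECT MyCol, seq,
       ROW_NUMBER() OVER (ORDER BY seq)
     - ROW_NUMBER() OVER (PARTITION BY MyCol ORDER BY seq) AS island
FROM MyTable
),
island_starts AS (
SELECT MyCol, seq,
       MIN(seq) OVER (PARTITION BY MyCol, island) AS island_start
FROM islands
)
SELECT MyCol,
       DENSE_RANK() OVER (ORDER BY island_start) AS GroupID  -- increments each time MyCol changes
FROM island_starts
ORDER BY seq;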

What's the best way of re-using classification rules for multiple queries within BigQuery standard SQL?

I'm using BigQuery to analyse Google Analytics data.
I need to classify visits depending on whether they hit particular URLs that indicate they were in the booking process, purchased, etc.
There is a long list of URLs representing each step, so it would be advantageous to put the classifications in a view and re-use it, with appropriate joins, in whatever query requires the classification.
I have the following view that seems to do what I need:
SELECT
fullVisitorId,
visitID,
LOWER(h.page.pagePath) AS path,
CASE
WHEN
LOWER(h.page.pagePath) = '/' THEN '/'
WHEN
LOWER(h.page.pagePath) LIKE '{path-here}%' OR
.... .... ....
ELSE 'other'
END
AS path_classification,
_TABLE_SUFFIX AS date
FROM
`{project-id}.{data-id}.ga_sessions_*`, UNNEST(hits) AS h
WHERE
REGEXP_CONTAINS(_TABLE_SUFFIX, r'[0-9]{8}')
AND
h.type = 'PAGE'
I'm wondering if there's a simpler way of achieving this that doesn't require selecting from a pre-existing table, as that doesn't seem necessary just to define the classifications. I get the feeling that something more straightforward is possible, but I'm not sure how to do it.
Does anyone know how to put these definitions into a view without querying a table within the view?
Let's consider a simple example:
#standardSQL
WITH yourTable AS (
SELECT 1 AS id, '123' AS path UNION ALL
SELECT 2, '234' UNION ALL
SELECT 3, '345' UNION ALL
SELECT 4, '456'
)
SELECT
id,
path,
CASE path
WHEN '123' THEN 'a'
WHEN '234' THEN 'b'
WHEN '345' THEN 'c'
ELSE 'other'
END AS path_classification
FROM yourTable
ORDER BY id
The above can be refactored into the below:
#standardSQL
WITH yourTable AS (
SELECT 1 AS id, '123' AS path UNION ALL
SELECT 2, '234' UNION ALL
SELECT 3, '345' UNION ALL
SELECT 4, '456'
)
SELECT
id,
path,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification
FROM yourTable,
(SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
FROM `project.dataset.rules`) AS r
ORDER BY id
which relies on a rules view that is defined as below:
#standardSQL
SELECT '123' AS path, 'a' AS rule UNION ALL
SELECT '234', 'b' UNION ALL
SELECT '345', 'c' UNION ALL
SELECT NULL, 'other'
As you can see, all the classification rules live only in the rules view!
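If you want that to be an actual view rather than pasting the SELECT each time, a minimal sketch using DDL, assuming the `project.dataset.rules` name used above:
#standardSQL
CREATE OR REPLACE VIEW `project.dataset.rules` AS
SELECT '123' AS path, 'a' AS rule UNION ALL
SELECT '234', 'b' UNION ALL
SELECT '345', 'c' UNION ALL
SELECT NULL, 'other'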
You can play around with this approach using the query below:
#standardSQL
WITH yourTable AS (
SELECT 1 AS id, '123' AS path UNION ALL
SELECT 2, '234' UNION ALL
SELECT 3, '345' UNION ALL
SELECT 4, '456'
),
rules AS (
SELECT '123' AS path, 'a' AS rule UNION ALL
SELECT '234', 'b' UNION ALL
SELECT '345', 'c' UNION ALL
SELECT NULL, 'other'
)
SELECT
id,
path,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification
FROM yourTable,
(SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
FROM rules) AS r
ORDER BY id
This can be further "simplified" by moving the ARRAY_AGG inside the view, as below:
#standardSQL
SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
FROM (
SELECT '123' AS path, 'a' AS rule UNION ALL
SELECT '234', 'b' UNION ALL
SELECT '345', 'c' UNION ALL
SELECT NULL, 'other'
)
In this case the final query is as simple as:
#standardSQL
SELECT
id,
path,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification
FROM yourTable, rules AS r
ORDER BY id
Depending on your specific rules, the above can and should be adjusted and optimized accordingly, but I hope this gives you the main direction.
Question from a comment: does your solution enable matching with the LIKE keyword or matching with a regex?
The original question was: what's the … way of re-using classification rules for multiple queries within BigQuery standard SQL?
So the examples in my initial answer just show how to make that happen (the focus is on “reuse”).
How you will use it (matching with the LIKE keyword or matching with regex) is totally up to you!
See example below
Take a look at path_classification_exact_match vs path_classification_like_match vs path_classification_regex_match
#standardSQL
WITH yourTable AS (
SELECT 1 AS id, '123' AS path UNION ALL
SELECT 2, '234' UNION ALL
SELECT 3, '345' UNION ALL
SELECT 4, '456' UNION ALL
SELECT 5, '234abc' UNION ALL
SELECT 6, '345bcd' UNION ALL
SELECT 7, '456cde'
),
rules AS (
SELECT ARRAY_AGG(STRUCT<cpath STRING, crule STRING>(path, rule)) AS rules
FROM (
SELECT '123' AS path, 'a' AS rule UNION ALL
SELECT '234', 'b' UNION ALL
SELECT '345', 'c' UNION ALL
SELECT NULL, 'other'
)
)
SELECT
id,
path,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath = path LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification_exact_match,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE path LIKE CONCAT('%',rr.cpath,'%') LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification_like_match,
IFNULL(
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE REGEXP_CONTAINS(path, rr.cpath) LIMIT 1),
( SELECT rr.crule FROM UNNEST(r.rules) AS rr WHERE rr.cpath IS NULL LIMIT 1)
) AS path_classification_regex_match
FROM yourTable, rules AS r
ORDER BY id
Output is:
id path path_classification_exact_match path_classification_like_match path_classification_regex_match
1 123 a a a
2 234 b b b
3 345 c c c
4 456 other other other
5 234abc other b b
6 345bcd other c c
7 456cde other other other
Hope this helps :o)
It sounds like you may be interested in WITH clauses, which let you compose queries without having to use subqueries. For example,
#standardSQL
WITH Sales AS (
SELECT 1 AS sku, 3.14 AS price UNION ALL
SELECT 2 AS sku, 1.00 AS price UNION ALL
SELECT 3 AS sku, 9.99 AS price UNION ALL
SELECT 2 AS sku, 0.90 AS price UNION ALL
SELECT 1 AS sku, 3.56 AS price
),
ItemTotals AS (
SELECT sku, SUM(price) AS total
FROM Sales
GROUP BY sku
)
SELECT sku, total
FROM ItemTotals;
If you want to compose expressions, you can use CREATE TEMP FUNCTION statements to provide "macro-like" functionality:
#standardSQL
CREATE TEMP FUNCTION LooksLikeCheese(s STRING) AS (
LOWER(s) IN ('gouda', 'gruyere', 'havarti')
);
SELECT
s1,
LooksLikeCheese(s1) AS s1_is_cheese,
s2,
LooksLikeCheese(s2) AS s2_is_cheese
FROM (
SELECT 'spam' AS s1, 'ham' AS s2 UNION ALL
SELECT 'havarti' AS s1, 'crackers' AS s2 UNION ALL
SELECT 'gruyere' AS s1, 'ice cream' AS s2
);
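Tying this back to the original question, here is a minimal sketch (the URL prefixes are made up; substitute your own patterns) of wrapping the path classification in a temporary SQL function so the CASE logic can be reused across queries:
#standardSQL
CREATE TEMP FUNCTION ClassifyPath(path STRING) AS (
  CASE
    WHEN LOWER(path) = '/' THEN '/'
    WHEN LOWER(path) LIKE '/booking%' THEN 'booking'    -- hypothetical prefix
    WHEN LOWER(path) LIKE '/purchase%' THEN 'purchase'  -- hypothetical prefix
    ELSE 'other'
  END
);
SELECT path, ClassifyPath(path) AS path_classification
FROM (
  SELECT '/' AS path UNION ALL
  SELECT '/booking/step-1' UNION ALL
  SELECT '/purchase/confirm' UNION ALL
  SELECT '/about'
);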

Get Var() and AVG() from many columns

I'm building a query to show average and variance from many columns.
To get the average I use this:
SELECT *,
(SELECT AVG(t.c)
FROM (
SELECT peca_1 UNION ALL
SELECT peca_2 UNION ALL
SELECT peca_3 UNION ALL
SELECT peca_4 UNION ALL
SELECT peca_5 UNION ALL
SELECT peca_6 UNION ALL
SELECT peca_7 UNION ALL
SELECT peca_8 UNION ALL
SELECT peca_9 UNION ALL
SELECT peca_10
) t(c)
) as [media]
from Durabilidade
where cd_durabilidade = 1
The result is a [media] column containing the row average, as expected.
Now I need a new column with VAR(media), comparing each row with the first row.
Any idea?
I think this is a case where cross apply is appropriate. I am assuming that you want the variance of the values as calculated by the var() function:
SELECT *, t.avgval as [media], t.varval
from Durabilidade d cross apply
(select avg(t.val) as avgval, var(t.val) as varval
from (select d.peca_1 union all
select d.peca_2 union all
select d.peca_3 union all
select d.peca_4 union all
select d.peca_5 union all
select d.peca_6 union all
select d.peca_7 union all
select d.peca_8 union all
select d.peca_9 union all
select d.peca_10
) t(val) -- the t(val) column alias is required here
) t
where cd_durabilidade = 1
Something like this?
SELECT *,
VAR(media) AS [variance]
FROM
(
SELECT *,
(SELECT AVG(t.c)
FROM (
SELECT peca_1 UNION ALL
SELECT peca_2 UNION ALL
SELECT peca_3 UNION ALL
SELECT peca_4 UNION ALL
SELECT peca_5 UNION ALL
SELECT peca_6 UNION ALL
SELECT peca_7 UNION ALL
SELECT peca_8 UNION ALL
SELECT peca_9 UNION ALL
SELECT peca_10
) t(c)
) as [media]
from Durabilidade
where cd_durabilidade = 1
) x
GROUP BY
column1_from_durabilidade
,column2_from_durabilidade
--etc
,media
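If "comparing each row with the first row" really means the variance of media across the returned rows, a sketch using VAR() as a window function (SQL Server supports aggregate functions, including VAR(), with an OVER () clause) avoids the GROUP BY over every column; treat this as an assumption about the intent rather than a confirmed requirement:
SELECT d.*,
       t.avgval AS [media],
       VAR(t.avgval) OVER () AS [media_var]  -- variance of media over all returned rows
FROM Durabilidade d CROSS APPLY
     (SELECT AVG(v.val) AS avgval
      FROM (VALUES (d.peca_1), (d.peca_2), (d.peca_3), (d.peca_4), (d.peca_5),
                   (d.peca_6), (d.peca_7), (d.peca_8), (d.peca_9), (d.peca_10)) v(val)
     ) t
WHERE d.cd_durabilidade = 1;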