How do I write a query in Google BigQuery to infer the data type of a column?

I have a table with all string columns, but I know certain columns are numbers (or dates). Is there a built-in function in BigQuery to infer the data type of individual columns? Something like select is_string(column_name) from table_name?

One idea that comes to mind is using SAFE_CAST in combination with LOGICAL_AND, e.g.:
#standardSQL
WITH T AS (
  SELECT '2017-05-01' AS x, '3.14' AS y, '5' AS z UNION ALL
  SELECT '2017-03-02' AS x, '1.59' AS y, '-1' AS z UNION ALL
  SELECT NULL AS x, NULL AS y, NULL AS z
)
SELECT
  LOGICAL_AND(x IS NULL OR SAFE_CAST(x AS DATE) IS NOT NULL) AS x_is_date,
  LOGICAL_AND(y IS NULL OR SAFE_CAST(y AS FLOAT64) IS NOT NULL) AS y_is_float64,
  LOGICAL_AND(z IS NULL OR SAFE_CAST(z AS TIMESTAMP) IS NOT NULL) AS z_is_timestamp
FROM T;
This results in true, true, and false (the z values are not timestamps). If you want to reuse the same expression multiple times, you can make this a little less verbose with a SQL UDF:
#standardSQL
CREATE TEMP FUNCTION IsDate(x STRING) AS (
  x IS NULL OR SAFE_CAST(x AS DATE) IS NOT NULL
);
WITH T AS (
  SELECT '2017-05-01' AS x, '3.14' AS y, '5' AS z UNION ALL
  SELECT '2017-03-02' AS x, '1.59' AS y, '-1' AS z UNION ALL
  SELECT NULL AS x, NULL AS y, NULL AS z
)
SELECT
  LOGICAL_AND(IsDate(x)) AS x_is_date,
  LOGICAL_AND(IsDate(y)) AS y_is_date,
  LOGICAL_AND(IsDate(z)) AS z_is_date
FROM T;
This results in true, false, false, since only x has values in date format.
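The same pattern extends to other types, which is useful when you want to classify each column rather than test a single hypothesis. A minimal sketch, checking the narrowest type first (the IsInt64/IsFloat64 helper names here are illustrative, not built-ins):
#standardSQL
CREATE TEMP FUNCTION IsInt64(x STRING) AS (
  x IS NULL OR SAFE_CAST(x AS INT64) IS NOT NULL
);
CREATE TEMP FUNCTION IsFloat64(x STRING) AS (
  x IS NULL OR SAFE_CAST(x AS FLOAT64) IS NOT NULL
);
CREATE TEMP FUNCTION IsDate(x STRING) AS (
  x IS NULL OR SAFE_CAST(x AS DATE) IS NOT NULL
);
WITH T AS (
  SELECT '2017-05-01' AS x, '3.14' AS y, '5' AS z UNION ALL
  SELECT '2017-03-02' AS x, '1.59' AS y, '-1' AS z
)
SELECT
  -- Report the most specific type whose cast succeeds for every row of z.
  CASE
    WHEN LOGICAL_AND(IsInt64(z)) THEN 'INT64'
    WHEN LOGICAL_AND(IsFloat64(z)) THEN 'FLOAT64'
    WHEN LOGICAL_AND(IsDate(z)) THEN 'DATE'
    ELSE 'STRING'
  END AS z_inferred_type
FROM T;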

Related

Snowflake table and generator functions do not give expected result

I tried to create a simple SQL to track query_history usage, but got into trouble when creating my timeslots using the table and generator functions (the CTE named x below).
I got no results at all when limiting the query_history using my timeslots, so after a while I hardcoded a query to give the same result (the CTE named y below), and this works fine.
Why doesn't x work? As far as I can see, x and y produce identical results.
To test the example, first run the code as it is; this produces no result.
Then comment out the line x as timeslots and un-comment the line y as timeslots; this will give the desired result.
with
x as (
  select
    dateadd('min', seq4()*10, dateadd('min', -60, current_timestamp())) f,
    dateadd('min', (seq4()+1)*10, dateadd('min', -60, current_timestamp())) t
  from table(generator(rowcount => 6))
),
y as (
  select
    dateadd('min', n*10, dateadd('min', -60, current_timestamp())) f,
    dateadd('min', (n+1)*10, dateadd('min', -60, current_timestamp())) t
  from (select 0 n union all select 1 union all select 2 union all select 3
        union all select 4 union all select 5)
)
--select * from x;
--select * from y;
select distinct
  user_name,
  timeslots.f
from snowflake.account_usage.query_history,
  x as timeslots
  --y as timeslots
where start_time >= timeslots.f
  and start_time < timeslots.t
order by timeslots.f desc;
(I know the code is not optimal, this is only meant to illustrate the problem)
From the Snowflake docs on the SEQ functions (SEQ1 / SEQ2 / SEQ4 / SEQ8):
Returns a sequence of monotonically increasing integers, with wrap-around. Wrap-around occurs after the largest representable integer of the integer width (1, 2, 4, or 8 byte).
If a fully ordered, gap-free sequence is required, consider using the ROW_NUMBER window function.
For:
with x as (
  select
    dateadd('min', seq4()*10, dateadd('min', -60, current_timestamp())) f,
    dateadd('min', (seq4()+1)*10, dateadd('min', -60, current_timestamp())) t
  from table(generator(rowcount => 6))
)
SELECT * FROM x;
Should be:
with x as (
  select
    (ROW_NUMBER() OVER (ORDER BY seq4())) - 1 AS n,
    dateadd('min', n*10, dateadd('min', -60, current_timestamp())) f,
    dateadd('min', (n+1)*10, dateadd('min', -60, current_timestamp())) t
  from table(generator(rowcount => 6))
)
SELECT * FROM x;
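If you want to confirm that gaps from seq4() really are the problem in your environment, a quick diagnostic (not from the original answer) is to compare the raw sequence against a gap-free ROW_NUMBER() numbering:
-- Any row where raw_seq differs from dense_n means seq4() skipped values,
-- which pushes the computed timeslots past current_timestamp() and out of
-- the query_history time range.
select
  seq4() as raw_seq,
  row_number() over (order by seq4()) - 1 as dense_n
from table(generator(rowcount => 6));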

View key-value rows as a multi-column single row

You have parameters x, y, z stored as key-value rows.
You want to execute an expression z = x + y on those parameters; the expression is stored in another table.
You want to generate an SQL query as simply as possible from the expression.
How can you view those parameter values as a single row with columns (x, y, z) to enable execution of the expression?
SELECT *
INTO #key_values
FROM (
  SELECT 'x' AS mykey, 2 AS myvalue
  UNION ALL
  SELECT 'y', 5
  UNION ALL
  SELECT 'z', 0
) a;
This screams for a PIVOT operator:
;WITH Inputs AS (
  SELECT 'x' AS mykey, 2 AS myvalue
  UNION ALL
  SELECT 'y', 5
  UNION ALL
  SELECT 'z', 0
)
SELECT
  U.x,
  U.y,
  U.z,
  Result = U.x + U.y
FROM
  Inputs AS I
  PIVOT (
    MAX(I.myvalue) FOR I.mykey IN (x, y, z)
  ) AS U
Results:
x  y  z  Result
2  5  0  7
You can build any expression you want with the pivoted columns in the SELECT.
If you want to update the z record, you will have to join back to the underlying table, since after applying the PIVOT you lose access to the original table.
IF OBJECT_ID('tempdb..#Input') IS NOT NULL
  DROP TABLE #Input

CREATE TABLE #Input (
  mykey   VARCHAR(10),
  myvalue INT)

INSERT INTO #Input (mykey, myvalue)
VALUES
  ('x', 2),
  ('y', 5),
  ('z', 0)

UPDATE I SET
  myvalue = R.Result
FROM
  #Input AS I
  CROSS APPLY (
    SELECT
      Result = x + y
    FROM
      #Input AS P -- aliased P rather than reusing I, to avoid shadowing the outer alias
    PIVOT (MAX(P.myvalue) FOR P.mykey IN (x, y, z)) AS U
  ) AS R
WHERE
  I.mykey = 'z'
Turn the 3 rows into a single 3-column row using a common table expression, and update it to run the expression. So the proposed solution is an updatable CTE.
WITH myvalues (x, y, z) AS (
  SELECT x.myvalue, y.myvalue, z.myvalue
  FROM #key_values AS x
  JOIN #key_values AS y ON y.mykey = 'y' AND x.mykey = 'x'
  JOIN #key_values AS z ON z.mykey = 'z'
)
UPDATE myvalues SET z = x + y;
SELECT myvalue FROM #key_values WHERE mykey = 'z';
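The question also says the expression itself is stored in another table, which none of the answers wire in directly. A minimal sketch using dynamic SQL on top of the PIVOT form, assuming a hypothetical #expressions table and the #key_values table created above:
-- Hypothetical storage for the expression text, e.g. 'x + y'.
IF OBJECT_ID('tempdb..#expressions') IS NOT NULL
  DROP TABLE #expressions;
CREATE TABLE #expressions (expr NVARCHAR(200));
INSERT INTO #expressions (expr) VALUES (N'x + y');

-- Build the query around the stored expression text and execute it.
DECLARE @sql NVARCHAR(MAX);
SELECT @sql = N'SELECT Result = ' + expr + N'
FROM #key_values
PIVOT (MAX(myvalue) FOR mykey IN (x, y, z)) AS U;'
FROM #expressions;
EXEC sp_executesql @sql;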

Condition statement in a select

Here's what I want to do:
Select X, Y, if Z IS NULL THEN ( select something ) else Z
Basically I want to select Z, but if it's null I want to select another value instead. Can someone please suggest a code example, with a CASE or something similar, that I can follow?
Select X, Y, NVL(Z, showThis) as Z
will return showThis if Z is null in Oracle.
Select X, Y, ISNULL(Z, showThis) as Z
will return showThis if Z is null in SQL Server.
Do you want coalesce()?
Select X, Y,
coalesce(z, <something else>) as z
Choose with CASE:
select
  X, Y,
  case
    when Z is null then (select something)
    else (select something else)
  end as col
from tablename
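If the "( select something )" branch is really a lookup in another table, a scalar subquery can sit directly inside COALESCE as well. A minimal sketch, assuming a hypothetical defaults table keyed by the same id as the main table:
SELECT
  t.X,
  t.Y,
  -- Fall back to the per-row lookup only when Z is null.
  COALESCE(t.Z, (SELECT d.fallback_z
                 FROM defaults d
                 WHERE d.id = t.id)) AS Z
FROM tablename t;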

create implicit data without having to create temp/volatile/working table

This is possible:
SELECT 'Bla' AS X
Why is this not possible in Teradata:
SELECT 'Bla' AS X
UNION
SELECT 'DiBla' AS X
Is there a way to achieve the above without having to create a temp/volatile/working table in Teradata?
PS:
The error is: A select for a union, intersect or minus must reference a table
If you want two columns on one row, then use:
SELECT 'Bla' AS X, 'DiBla' AS Y
If you want:
X
Bla
DiBla
Then you just do:
select 'Bla' as X
union all
select 'DiBla' as X;
If you want:
X     Y
Bla   NULL
NULL  DiBla
Then:
SELECT 'Bla' as X, NULL as Y
UNION ALL
SELECT NULL as X, 'DiBla' as Y
You have an attribute name mismatch in your UNION. You cannot perform a UNION between relations that have different structures. Therefore, use:
SELECT cast('Bla' as varchar(6)) AS X FROM (SELECT 1 a) t
UNION
SELECT cast('DiBla' as varchar(6)) AS X FROM (SELECT 1 a) t
The explicit casting makes sure that the data types are equivalent, as well as the attribute names. Another solution could be:
SELECT * FROM (
  SELECT cast('Bla' as varchar(6)) AS X
) t
UNION
SELECT * FROM (
  SELECT cast('DiBla' as varchar(6)) AS X
) t

How to select columns of data in BigQuery that has all NULL values

Given this table:
A     B     C
NULL  1     NULL
NULL  NULL  NULL
NULL  2     NULL
NULL  3     NULL
I want to retrieve columns A and C. Please can you help!!
Expanding on my comment on Mikhail's answer, this is what I had in mind. It doesn't require generating a query string, which could be quite long if you have a large number of columns. It compares the count of null values for each column name to the total number of rows in the table to decide if the column should be included in the result.
#standardSQL
WITH `project.dataset.table` AS (
  SELECT NULL A, 1 B, NULL C UNION ALL
  SELECT NULL, NULL, NULL UNION ALL
  SELECT NULL, 2, NULL UNION ALL
  SELECT NULL, 3, NULL
)
SELECT null_column
FROM `project.dataset.table` AS t,
  UNNEST(REGEXP_EXTRACT_ALL(
    TO_JSON_STRING(t),
    r'\"([a-zA-Z0-9\_]+)\":null')
  ) AS null_column
GROUP BY null_column
HAVING COUNT(*) = (SELECT COUNT(*) FROM `project.dataset.table`);
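To see why the regular expression finds the right columns, it helps to look at what TO_JSON_STRING produces for a single row; a quick illustration (not part of the original answer):
#standardSQL
SELECT TO_JSON_STRING(t) AS row_as_json
FROM (SELECT NULL A, 1 B, NULL C) t;
-- row_as_json: {"A":null,"B":1,"C":null}
-- The regex captures every key that is immediately followed by :null.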
Below is for BigQuery Standard SQL.
Simple option:
#standardSQL
WITH `project.dataset.table` AS (
  SELECT NULL A, 1 B, NULL C UNION ALL
  SELECT NULL, NULL, NULL UNION ALL
  SELECT NULL, 2, NULL UNION ALL
  SELECT NULL, 3, NULL
)
SELECT COUNT(A) A, COUNT(B) B, COUNT(C) C
FROM `project.dataset.table`
It returns the result below, where 0 (zero) indicates that the respective column contains only NULLs:
A  B  C
0  3  0
If this is not enough, below is a more "sophisticated" version:
#standardSQL
WITH `project.dataset.table` AS (
  SELECT NULL A, 1 B, NULL C UNION ALL
  SELECT NULL, NULL, NULL UNION ALL
  SELECT NULL, 2, NULL UNION ALL
  SELECT NULL, 3, NULL
)
SELECT SPLIT(y, ':')[OFFSET(0)] column
FROM (
  SELECT REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}"]', '') x
  FROM (
    SELECT COUNT(A) A, COUNT(B) B, COUNT(C) C
    FROM `project.dataset.table`
  ) t
), UNNEST(SPLIT(x)) y
WHERE CAST(SPLIT(y, ':')[OFFSET(1)] AS INT64) = 0
It returns the result below, listing only the columns with all NULLs:
column
A
C
Note: for your real table, just remove the WITH block and replace project.dataset.table with your real table reference.
Also, of course, use real column names.
My table has around 700 columns.
Below is an example of how you can easily generate the above query for any number of columns.
1. Just run the query below
2. Copy the result - this is the generated query
3. Paste the generated query into a new UI tab and run it
4. Enjoy (I hope you will) the result :o)
Of course, as usual, replace project.dataset.table with your real table reference.
#standardSQL
SELECT
  CONCAT('''
SELECT SPLIT(y, ':')[OFFSET(0)] column
FROM (
SELECT REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}"]', '') x
FROM (
SELECT ''', y, '''
FROM `project.dataset.table`
) t
), UNNEST(SPLIT(x)) y
WHERE CAST(SPLIT(y, ':')[OFFSET(1)] AS INT64) = 0
''')
FROM (
  SELECT
    STRING_AGG(CONCAT('COUNT(', x, ') ', x), ', ') y
  FROM (
    SELECT REGEXP_EXTRACT_ALL(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), r'"([\w_]+)":') x
    FROM `project.dataset.table` t
    LIMIT 1
  ), UNNEST(x) x
)
Note: please pay attention to query cost - both the "generation query" and the final query itself will do a full table scan.
You can generate the column list much more cheaply from the table schema in any client of your choice.
To test / play with it, you can use the same dummy data as for the initial queries in my answer.
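Following up on the schema note above: in current BigQuery you can build the COUNT(...) list from INFORMATION_SCHEMA instead of scanning the table. A minimal sketch, assuming INFORMATION_SCHEMA views are available for your dataset (replace project.dataset and 'table' with real names):
#standardSQL
-- Metadata-only query: builds the column list without a full scan of the data.
SELECT STRING_AGG(CONCAT('COUNT(', column_name, ') ', column_name), ', ') AS count_list
FROM `project.dataset`.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'table';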