Conditionally include fields in a Presto query - sql

I have a presto query which works as expected:
SELECT json_format(cast(MAP(ARRAY['random_name'],
  ARRAY[
    MAP(
      ARRAY['a', 'b', 'c'],
      ARRAY[a, b, c]
    )]
  ) as JSON)) as metadata
from a_table_with_a_b_c; -- a, b, c are all ints
Now I only want to include a, b, c when they are larger than 0. How do I change the query? I can add a CASE WHEN, but it seems I will then get 'a': null instead of the key being absent.

You can try this:
with rows as (
  select row_number() over () as row_id,
         ARRAY['a', 'b', 'c'] as keys,
         ARRAY[a, b, c] as vals
  from a_table_with_a_b_c
)
select
  json_format(cast(MAP(ARRAY['random_name'],
    ARRAY[
      MAP(
        array_agg(key), array_agg(value)
      )]
    ) as JSON)) as metadata
from rows
cross join unnest (keys, vals) as t (key, value)
where value is not null and value > 0
group by row_id;
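If your Presto version ships map_filter, a shorter sketch (untested; same table and column names as above) filters the map entries directly instead of unnesting and re-aggregating:

SELECT json_format(cast(MAP(ARRAY['random_name'],
  ARRAY[
    -- keep only entries with a positive value; filtered keys are absent, not null
    map_filter(
      MAP(ARRAY['a', 'b', 'c'], ARRAY[a, b, c]),
      (k, v) -> v is not null and v > 0
    )]
  ) as JSON)) as metadata
from a_table_with_a_b_c;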

Related

Snowflake SQL - OBJECT_CONSTRUCT from COUNT and GROUP BY

I'm trying to summarize data in a table:
counting total rows
counting values on specific fields
getting the distinct values on specific fields
and, more importantly, I'm struggling with:
getting the count for each field nested in an object
given this data:
COL1  COL2
A     0
null  1
B     null
B     null
the expected result from this query would be:
with dummy as (
select 'A' as col1, 0 as col2
union all
select null, 1
union all
select 'B', null
union all
select 'B', null
)
select
count(1) as total
,count(col1) as col1
,array_agg(distinct col1) as dist_col1
--,object_construct(???) as col1_object_count
,count(col2) as col2
,array_agg(distinct col2) as dist_col2
--,object_construct(???) as col2_object_count
from
dummy
TOTAL  COL1  DIST_COL1   COL1_OBJECT_COUNT          COL2  DIST_COL2  COL2_OBJECT_COUNT
4      3     ["A", "B"]  {"A": 1, "B": 2, null: 1}  2     [0, 1]     {0: 1, 1: 1, null: 2}
I've tried several functions inside OBJECT_CONSTRUCT mixed with ARRAY_AGG, but all failed.
OBJECT_CONSTRUCT can work with several columns, but only when given all of them (*); if you try a select statement inside it, it will fail.
Another issue is that analytical functions are not easily accepted by the object or array functions in Snowflake.
You could use Snowflake Scripting or Snowpark for this, but here's a solution that is somewhat flexible, so you can apply it to different tables and column sets.
Create test table/view:
Create or Replace View dummy as (
select 'A' as col1, 0 as col2
union all
select null, 1
union all
select 'B', null
union all
select 'B', null
);
Set session variables for table and colnames.
set tbname = 'DUMMY';
set colnames = '["COL1", "COL2"]';
Create view that generates the required table_column_summary data:
Create or replace View table_column_summary as
with
-- Create table of required column names
cn as (
select VALUE::VARCHAR CNAME
from table(flatten(input => parse_json($colnames)))
)
-- Convert rows into objects
,ro as (
select
object_construct_keep_null(*) row_object
-- using identifier on session variable to dynamically supply table/view name
from identifier($tbname) )
-- Flatten row objects into key/values
,rof as (
select
key col_name,
ifnull(value,'null')::VARCHAR col_value
from ro, lateral flatten(input => row_object), cn
-- You will only need this filter if you need a subset
-- of columns from the source table/query summarised
where col_name = cn.cname)
-- Get the column value distinct value counts
,cdv as (
select col_name,
col_value,
sum(1) col_value_count
from rof
group by 1,2
)
-- and derive required column level stats and combine with cdv
,cv as (
select
(select count(1) from identifier($tbname)) total, -- dynamic source, not hard-coded to DUMMY
col_name,
object_construct('COL_COUNT', count(col_value) ,
'COL_DIST', array_agg(distinct col_value),
'COL_OBJECT_COUNT', object_agg(col_value,col_value_count)) col_values
from cdv
group by 1,2)
-- Return result
Select * from cv;
Use this final query if you want a solution that works flexibly with any table/columns provided as input...
Select total, object_agg(col_name, col_values) col_values_obj
From table_column_summary
Group by 1;
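Against the DUMMY view above, this should return a single row shaped roughly like the following (values elided; note that the rof CTE casts everything to VARCHAR, so numbers and nulls come back as strings):

TOTAL  COL_VALUES_OBJ
4      {"COL1": {"COL_COUNT": ..., "COL_DIST": [...], "COL_OBJECT_COUNT": {...}},
        "COL2": {"COL_COUNT": ..., "COL_DIST": [...], "COL_OBJECT_COUNT": {...}}}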
Or use this final query if you want the fixed columns output as described in your question...
Select total,
COL1[0]:COL_COUNT COL1,
COL1[0]:COL_DIST DIST_COL1,
COL1[0]:COL_OBJECT_COUNT COL1_OBJECT_COUNT,
COL2[0]:COL_COUNT COL2,
COL2[0]:COL_DIST DIST_COL2,
COL2[0]:COL_OBJECT_COUNT COL2_OBJECT_COUNT
from table_column_summary
PIVOT ( ARRAY_AGG ( col_values )
FOR col_name IN ( 'COL1', 'COL2' ) ) as pt (total, col1, col2);

Generate a JSON array of values for each row

Assuming the following CTE:
with mytable as (
select column1 as foo, column2 as bar, column3 as baz
from (values
('a', 'b', 1),
('c', 'd', 2)
) v
)
Using array_agg() outputs an array of values:
select
array_agg(v)
from mytable v;
-- {"(a,b,1)","(c,d,2)"}
but surprisingly (to me at least), using to_json() on this array restores the field names into an object for each row:
select
to_json(array_agg(v))
from mytable v;
-- [{"foo":"a","bar":"b","baz":1},{"foo":"c","bar":"d","baz":2}]
How can we make PostgreSQL output an array of arrays instead, rendering each row as an array of values?
select
something(v)
from mytable v;
-- [["a", "b", 1],["c", "d", 2]]
You can convert each row into JSON, unnest the key/value pairs, and then aggregate the values back:
with mytable (foo, bar, baz) as (
values
('a', 'b', 1),
('c', 'd', 2)
)
select jsonb_agg(x.vals)
from mytable m
cross join lateral (
select jsonb_agg(value order by idx) as vals
from json_each(row_to_json(m)) with ordinality as t(key,value,idx)
) x
It's important to use json (rather than jsonb) to convert the row if the order of the column values in the array matters to you, because jsonb does not preserve key order.
If you need this often, you can put this into a function.
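A minimal sketch of such a function, assuming a made-up name jsonb_row_values; any composite value can be passed through the polymorphic anyelement parameter (a scalar argument would fail at row_to_json):

create or replace function jsonb_row_values(p_row anyelement)
returns jsonb
language sql
as $$
  -- row_to_json preserves column order; with ordinality carries that order into the aggregate
  select jsonb_agg(value order by idx)
  from json_each(row_to_json(p_row)) with ordinality as t(key, value, idx)
$$;

-- usage: select jsonb_agg(jsonb_row_values(m)) from mytable m;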
If the order of the column values in the array isn't important, you can use a JSON path function:
select jsonb_path_query_array(to_jsonb(m), '$.keyvalue().value')
from mytable m;
Besides the answer from a_horse_with_no_name, I just found a way to achieve this, assuming column names are known:
with mytable as (
select column1 as foo, column2 as bar, column3 as baz
from (values
('a', 'b', 1),
('c', 'd', 2)
) v
)
select
to_json(array_agg(x.vals))
from (
select
json_build_array(
v.foo,
v.bar,
v.baz
) as vals
from mytable v
) x
;
-- [["a", "b", 1],["c", "d", 2]]

How to aggregate arrays element by element in BigQuery?

In BigQuery, how can I aggregate arrays element by element?
For instance, if I have this table:
id  array_value
1   [1, 2, 3]
2   [4, 5, 6]
3   [7, 8, 9]
I want to sum all the vectors element-wise and output [1+4+7, 2+5+8, 3+6+9] = [12, 15, 18].
I can SUM float fields with SELECT SUM(float_field) FROM table, but when I try to apply SUM to an array I get:
No matching signature for aggregate function SUM for argument types: ARRAY.
Supported signatures: SUM(INT64); SUM(FLOAT64); SUM(NUMERIC); SUM(BIGNUMERIC) at [1:8]
I have found ARRAY_AGG in the doc but it is not what I want: it just creates an array from values.
I think you want:
select array_agg(sum_val order by idx) as res
from (
select idx, sum(val) as sum_val
from mytable t
cross join unnest(t.array_value) as val with offset as idx
group by idx
) t
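A self-contained version for testing; the table and data below just mirror the question's sample:

with mytable as (
  select 1 as id, [1, 2, 3] as array_value
  union all select 2, [4, 5, 6]
  union all select 3, [7, 8, 9]
)
select array_agg(sum_val order by idx) as res
from (
  select idx, sum(val) as sum_val
  from mytable t
  cross join unnest(t.array_value) as val with offset as idx
  group by idx
)
-- res: [12, 15, 18]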
I think you want:
select array_agg(sum_val)
from (select (select sum(val)
from unnest(t.array_value) val
) as sum_val
from t
) x
Note that this sums within each row's array, giving [6, 15, 24] for the sample data, rather than element-wise across rows.
I think technically you simply refer to the individual values in the arrays using offset(), or safe_offset() in case there might be missing values:
-- example data
with temp as (
select * from unnest([
struct(1 as id, [1, 2, 3] as array_value),
(2, [4,5,6]),
(3, [7,8])
])
)
-- actual query
select
[
SUM( array_value[safe_offset(0)] ),
SUM( array_value[safe_offset(1)] ),
SUM( array_value[safe_offset(2)] )
] as result_array
from temp
I put them in a result array, but you don't have to do that. I had the last array missing one value to show that the query doesn't break. If you want it to break, use offset() without the 'safe_' prefix.
Below is for BigQuery Standard SQL
select array_agg(val order by offset)
from (
select offset, sum(val) as val
from `project.dataset.table` t,
unnest(array_value) as val with offset
group by offset
)

Extract last N elements of an array in SQL (hive)

I have a column with arrays and I want to extract the last X elements of each array.
Example trying to extract the last two elements:
Column A
['a', 'b', 'c']
['d', 'e']
['f', 'g', 'h', 'i']
Expected output:
Column A
['b', 'c']
['d', 'e']
['h', 'i']
The best case scenario would be to do it without using a UDF.
One method uses reverse, explode, filtering, and re-assembling the array:
with your_table as (
  select stack (4,
    0, array(), --empty array to check it works if no elements or fewer than n
    1, array('a', 'b', 'c'),
    2, array('d', 'e'),
    3, array('f', 'g', 'h', 'i')
  ) as (id, col_A)
)
select s.id, collect_list(s.value) as col_A
from
  (select s.id, reverse(a.value) as value, a.pos --reverse each element back so multi-character values are not garbled by the string reverse
   from your_table s
   lateral view outer posexplode(split(reverse(concat_ws(',',s.col_A)),',')) a as pos, value
   where a.pos between 0 and 1 --last two (use n-1 instead of 1 if you want last n)
   distribute by s.id sort by a.pos desc --keep original order
  ) s
group by s.id
Result:
s.id col_a
0 []
1 ["b","c"]
2 ["d","e"]
3 ["h","i"]
A more elegant way uses the Brickhouse numeric_range UDF, as shown in this answer.
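If this actually runs on Spark SQL rather than plain Hive, the built-in slice function avoids the explode entirely. A sketch assuming the same your_table; note Hive itself has no slice, and greatest() guards arrays shorter than two elements:

select id,
       slice(col_A, greatest(size(col_A) - 1, 1), 2) as col_A --start at the second-to-last element; shorter arrays just return what they have
from your_table;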

Get first N elements from an array in BigQuery table

I have an array column and I would like to get the first N elements of it (keeping the array data type). Is there a nice way to do it? Ideally without unnesting, ranking, and array_agg-ing back into an array.
I could also do this (to get the first 2 elements):
WITH data AS
(
SELECT 1001 as id, ['a', 'b', 'c'] as array_1
UNION ALL
SELECT 1002 as id, ['d', 'e', 'f', 'g'] as array_1
UNION ALL
SELECT 1003 as id, ['h', 'i'] as array_1
)
select *,
[array_1[SAFE_OFFSET(0)], array_1[SAFE_OFFSET(1)]] as my_result
from data
But obviously this is not a nice solution: when an array has only 1 element, SAFE_OFFSET(1) returns NULL, so the result array contains NULL entries (plain OFFSET would error outright).
Here's a general solution with a UDF that you can call for any array type:
CREATE TEMP FUNCTION TopN(arr ANY TYPE, n INT64) AS (
ARRAY(SELECT x FROM UNNEST(arr) AS x WITH OFFSET off WHERE off < n ORDER BY off)
);
WITH data AS
(
SELECT 1001 as id, ['a', 'b', 'c'] as array_1
UNION ALL
SELECT 1002 as id, ['d', 'e', 'f', 'g'] as array_1
UNION ALL
SELECT 1003 as id, ['h', 'i'] as array_1
)
select *, TopN(array_1, 2) AS my_result
from data
It uses unnest and the array function, which it sounds like you didn't want to use, but it has the advantage of being general enough that you can pass any array to it.
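With the sample data above, TopN(array_1, 2) should produce (derived by hand, not from a live run):

id    my_result
1001  ["a", "b"]
1002  ["d", "e"]
1003  ["h", "i"]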
Another option for BigQuery Standard SQL (with JS UDF)
#standardSQL
CREATE TEMP FUNCTION FirstN(arr ARRAY<STRING>, N FLOAT64)
RETURNS ARRAY<STRING> LANGUAGE js AS """
return arr.slice(0, N);
""";
SELECT *,
FirstN(array_1, 3) AS my_result
FROM data
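This assumes the same data CTE from the question is in scope. Since JavaScript's Array.prototype.slice simply stops at the end of shorter arrays, FirstN(array_1, 3) should yield (derived by hand):

id    my_result
1001  ["a", "b", "c"]
1002  ["d", "e", "f"]
1003  ["h", "i"]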