Parsing a string in a Hive table - SQL

I have a Hive table with two columns (day, type_of_day), both of type string:
"monday" [{"temp" : 45, "weather": "rainny"}, {"temp" : 25, "weather": "sunny"}, {"temp" : 15, "weather": "storm"}]
"tuesday" [{"temp" : 5, "weather": "winter"}, {"temp" : 10, "weather": "sun"}, {"temp" : 18, "weather": "dawn"}]
I want to split this (I guess explode is the technical term) and get just the list of temp values for each day, like below. I'm familiar with how to do this in Python, but is there a way to do it directly in Hive?
"monday" [45, 25, 15]
"tuesday" [5, 10, 18]

Tested with your example data. Replace the CTE with your table and read the comments in the code:
with your_table as ( --use your table instead of this CTE
select stack(2,
"monday", '[{"temp" : 45, "weather": "rainny"}, {"temp" : 25, "weather": "sunny"}, {"temp" : 15, "weather": "storm"}]',
"tuesday",'[{"temp" : 5, "weather": "winter"}, {"temp" : 10, "weather": "sun"}, {"temp" : 18, "weather": "dawn"}]'
) as (day, type_of_day)
) --use your table instead of this CTE
select s.day,
       array(get_json_object(type_of_day_array[0],'$.temp'),
             get_json_object(type_of_day_array[1],'$.temp'),
             get_json_object(type_of_day_array[2],'$.temp')
       ) as result_array --extract JSON elements and construct the array
from
(
    select day,
           split(regexp_replace(regexp_replace(type_of_day,'\\[|\\]',''), --remove square brackets
                                '\\}, *\\{','\\}##\\{'), --insert a convenient split separator
                 '##') --split on it
           as type_of_day_array
    from your_table --use your table instead of this CTE
) s;
Result:
s.day result_array
monday ["45","25","15"]
tuesday ["5","10","18"]
If the array of JSON can contain more than three elements, then you can use lateral view explode or posexplode and then build the resulting array like in this answer: https://stackoverflow.com/a/51570035/2700344.
Wrap array elements in cast(... as int) if you need array<int> as a result instead of array<string>:
cast(get_json_object(type_of_day[0],'$.temp') as int)...
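For completeness, a minimal sketch of the lateral view explode variant mentioned above, assuming the same your_table CTE (note: collect_list does not guarantee element order, so use the posexplode technique from the linked answer if order matters):
select day,
       collect_list(get_json_object(element, '$.temp')) as result_array --aggregate temps back into an array
from
(
    select day, element
    from your_table
    lateral view explode(split(regexp_replace(regexp_replace(type_of_day,'\\[|\\]',''),
                                              '\\}, *\\{','\\}##\\{'),
                               '##')) e as element --one row per JSON object
) s
group by day;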


Given two arrays in Snowflake, construct element-wise minimum and maximum arrays

Given two arrays like a = [10, 20, 30] and b = [9, 21, 32], how can I construct an array that consists of the minimum or maximum element at each index in Snowflake? I.e., the desired output for the minimum is [9, 20, 30] and for the maximum is [10, 21, 32].
I looked at Snowflake's array functions and didn't find a function that does this.
If the arrays are always the same size (reusing Lukasz's great data CTE):
WITH cte AS (
SELECT ARRAY_CONSTRUCT(10, 20, 30) AS a, ARRAY_CONSTRUCT(9, 21, 32) AS b
)
SELECT a,b
,ARRAY_AGG(LEAST(a[n.index], b[n.index])) WITHIN GROUP(ORDER BY n.index) AS min_array
,ARRAY_AGG(GREATEST(a[n.index], b[n.index])) WITHIN GROUP(ORDER BY n.index) AS max_array
FROM cte
,table(flatten(a)) n
GROUP BY 1,2;
gives:
A             B            MIN_ARRAY    MAX_ARRAY
[10, 20, 30]  [9, 21, 32]  [9, 20, 30]  [10, 21, 32]
And if you have uneven lists:
WITH cte AS (
SELECT ARRAY_CONSTRUCT(10, 20, 30) AS a, ARRAY_CONSTRUCT(9, 21, 32) AS b
union all
SELECT ARRAY_CONSTRUCT(10, 20, 30) AS a, ARRAY_CONSTRUCT(9, 21, 32, 45) AS b
)
SELECT a,b
,ARRAY_AGG(LEAST(a[n.index], b[n.index])) WITHIN GROUP(ORDER BY n.index) AS min_array
,ARRAY_AGG(GREATEST(a[n.index], b[n.index])) WITHIN GROUP(ORDER BY n.index) AS max_array
FROM cte
,table(flatten(iff(array_size(a)>=array_size(b), a, b))) n
GROUP BY 1,2;
A             B                MIN_ARRAY    MAX_ARRAY
[10, 20, 30]  [9, 21, 32]      [9, 20, 30]  [10, 21, 32]
[10, 20, 30]  [9, 21, 32, 45]  [9, 20, 30]  [10, 21, 32]
The iff picks the larger array to flatten. But since a NULL from the shorter list makes LEAST/GREATEST return NULL, and ARRAY_AGG drops NULLs, you don't even need the size comparison, unless you want to NVL/COALESCE the values to safe defaults for NULLs.
SELECT 1 as a, null as b, least(a,b);
gives:
A    B     LEAST(A,B)
1    null  null
like so:
SELECT a,b
,ARRAY_AGG(LEAST(nvl(a[n.index],10000), nvl(b[n.index],10000))) WITHIN GROUP(ORDER BY n.index) AS min_array
,ARRAY_AGG(GREATEST(nvl(a[n.index],0), nvl(b[n.index],0))) WITHIN GROUP(ORDER BY n.index) AS max_array
FROM cte
,table(flatten(iff(array_size(a)>=array_size(b), a, b))) n
GROUP BY 1,2;
A             B                MIN_ARRAY        MAX_ARRAY
[10, 20, 30]  [9, 21, 32]      [9, 20, 30]      [10, 21, 32]
[10, 20, 30]  [9, 21, 32, 45]  [9, 20, 30, 45]  [10, 21, 32, 45]
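If the sentinel values (10000 and 0) feel fragile, an alternative sketch is to fall back to the other array's element when one side is missing, so the longer tail passes through unchanged (the size comparison in the flatten is still needed here, to iterate the longer array):
SELECT a,b
,ARRAY_AGG(LEAST(NVL(a[n.index], b[n.index]), NVL(b[n.index], a[n.index]))) WITHIN GROUP(ORDER BY n.index) AS min_array
,ARRAY_AGG(GREATEST(NVL(a[n.index], b[n.index]), NVL(b[n.index], a[n.index]))) WITHIN GROUP(ORDER BY n.index) AS max_array
FROM cte
,table(flatten(iff(array_size(a)>=array_size(b), a, b))) n
GROUP BY 1,2;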
Using a numbers table and [] to access elements, with ARRAY_AGG to build the new arrays:
WITH cte AS (
SELECT ARRAY_CONSTRUCT(10, 20, 30) AS a, ARRAY_CONSTRUCT(9, 21, 32) AS b
), numbers AS (
-- ROW_NUMBER() over seq4() yields a gap-free 0-based index
-- (the sequence from seq4() alone is not guaranteed to be gap-free)
SELECT ROW_NUMBER() OVER(ORDER BY seq4())-1 AS ind
FROM TABLE(GENERATOR(ROWCOUNT => 10001))
)
SELECT a, b
,ARRAY_AGG(LEAST(a[n.ind], b[n.ind])) WITHIN GROUP(ORDER BY n.ind) AS min_array
,ARRAY_AGG(GREATEST(a[n.ind], b[n.ind])) WITHIN GROUP(ORDER BY n.ind) AS max_array
FROM cte
JOIN numbers n
ON n.ind < GREATEST(ARRAY_SIZE(a), ARRAY_SIZE(b))
GROUP BY a, b;
Output:

A             B            MIN_ARRAY    MAX_ARRAY
[10, 20, 30]  [9, 21, 32]  [9, 20, 30]  [10, 21, 32]

Pivoting data into JSON with numbered fields

In SQL Server 2017, I have a table as follows:
CREATE TABLE #Data
(
Code VARCHAR (2)
, RegionCode VARCHAR (10)
, Prop INT
, Val VARCHAR (200)
, PropF VARCHAR (50)
, PropFD VARCHAR (200)
)
INSERT INTO #Data
(
Code, RegionCode, Prop, Val, PropF, PropFD
)
VALUES
('AD', 'DLSO324', 1, 'Abcdefg', 'SD', 'SomeDescription')
, ('AD', 'DLSO324', 2, 'sdfadf', 'SA', 'SomethingA')
, ('AD', 'DLSO324', 3, 'gfdsdfg', 'SB', 'SomethingB')
, ('AD', 'DLSO324', 4, 'r43df', 'SC', 'SomethingC')
, ('AD', 'DLSO324', 5, 'GHD-123', 'SD2', 'SomethingD')
, ('AD', 'DLSO324', 6, '2013-03-42', 'SE', 'SomethingE')
, ('AD', 'XR1046', 34, 'Value1', 'dsf', 'Desc1')
, ('AD', 'XR1046', 65, 'Value1', 'gfsd', 'Desc1')
, ('AD', 'XR1046', 23, 'Value1', 'dg', 'Desc1')
, ('AD', 'XR1046', 67, 'Value1', 'fgh', 'Desc1')
, ('AD', 'XR1046', 45, 'Value1', 'fh', 'Desc1')
, ('AD', 'XR1046', 99, 'Value1', 'hfgfgh', 'Desc1')
SELECT *
FROM #Data
where you'll notice that a Code and RegionCode combination has multiple props, each prop having a value (Val), a property code (PropF), and a property field description (PropFD). The number of properties a Code and RegionCode combination can have varies anywhere between 1 and 100, and different combinations of Code and RegionCode can have different PropF and PropFD values even if they share the same prop number.
What I need to do is write a query that pivots the data and produces one row per Code and RegionCode with some JSON data. I need to completely flatten out the JSON so that each Prop number has its own Val, PropF, and PropFD field. My desired structure is as follows (you'll notice that the _number suffix corresponds to the Prop value in the #Data table):
[
{
"Val_1": "Abcdefg",
"PropF_1": "SD",
"PropFD_1": "SomeDescription",
"Val_2": "sdfadf",
"PropF_2": "SA",
"PropFD_2": "SomethingA",
"Val_3": "gfdsdfg",
"PropF_3": "SB",
"PropFD_3": "SomethingB",
"Val_4": "r43df",
"PropF_4": "SC",
"PropFD_4": "SomethingC",
"Val_5": "GHD-123",
"PropF_5": "SD2",
"PropFD_5": "SomethingD",
"Val_6": "2013-03-42",
"PropF_6": "SE",
"PropFD_6": "SomethingE"
}
]
So far I have the following query:
SELECT x.Code
, x.RegionCode
, ( SELECT y.Prop id
, y.Val
, y.PropF
, y.PropFD
FROM #Data y
WHERE y.Code = x.Code
AND y.RegionCode = x.RegionCode
FOR JSON PATH) FieldData
FROM #Data x
GROUP BY x.Code
, x.RegionCode
Is there a way for me to get my desired structure using JOINs and the SQL Server 2017 JSON functions? I want to avoid using PIVOT if possible due to performance reasons.
Since SQL Server is declarative by design, your desired results would require either Dynamic SQL or some String Manipulation.
The following demonstrates a little string manipulation in concert with string_agg()
Example
SELECT Code
,RegionCode
,FieldData = '[{'+string_agg(concat('"Val_',prop,'":"',Val,'","PropF_',Prop,'":"',PropF,'","PropFD_',Prop,'":"',PropFD,'"'),',')+'}]'
FROM #Data
Group By Code,RegionCode
Results

The first record's JSON:
[
{
"Val_1": "Abcdefg",
"PropF_1": "SD",
"PropFD_1": "SomeDescription",
"Val_2": "sdfadf",
"PropF_2": "SA",
"PropFD_2": "SomethingA",
"Val_3": "gfdsdfg",
"PropF_3": "SB",
"PropFD_3": "SomethingB",
"Val_4": "r43df",
"PropF_4": "SC",
"PropFD_4": "SomethingC",
"Val_5": "GHD-123",
"PropF_5": "SD2",
"PropFD_5": "SomethingD",
"Val_6": "2013-03-42",
"PropF_6": "SE",
"PropFD_6": "SomethingE"
}
]
The second record's JSON:
[
{
"Val_34": "Value1",
"PropF_34": "dsf",
"PropFD_34": "Desc1",
"Val_65": "Value1",
"PropF_65": "gfsd",
"PropFD_65": "Desc1",
"Val_23": "Value1",
"PropF_23": "dg",
"PropFD_23": "Desc1",
"Val_67": "Value1",
"PropF_67": "fgh",
"PropFD_67": "Desc1",
"Val_45": "Value1",
"PropF_45": "fh",
"PropFD_45": "Desc1",
"Val_99": "Value1",
"PropF_99": "hfgfgh",
"PropFD_99": "Desc1"
}
]
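One caveat with the string manipulation: concat() does not escape quotes, backslashes, or control characters inside the values, which can produce invalid JSON. A hedged variant of the same query using STRING_ESCAPE (available since SQL Server 2016):
SELECT Code
      ,RegionCode
      ,FieldData = '[{'+string_agg(concat('"Val_',Prop,'":"',string_escape(Val,'json')
                                         ,'","PropF_',Prop,'":"',string_escape(PropF,'json')
                                         ,'","PropFD_',Prop,'":"',string_escape(PropFD,'json'),'"'),',')+'}]'
FROM #Data
Group By Code,RegionCode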

How to count array element occurrences in Presto?

I have an array in Presto and I'd like to count how many times each element occurs in it. For example, I have
[a, a, a, b, b]
and I'd like to get something like
{a: 3, b: 2}
We do not have a direct function for this, but you can combine UNNEST with histogram:
presto> SELECT histogram(x)
-> FROM UNNEST(ARRAY[1111, 1111, 22, 22, 1111]) t(x);
_col0
----------------
{22=2, 1111=3}
You may want to file a new issue for a direct function for this.
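Applied to the letters from the question, the same approach gives (a quick check in the same CLI style):
presto> SELECT histogram(x)
     -> FROM UNNEST(ARRAY['a', 'a', 'a', 'b', 'b']) t(x);
    _col0
 ------------
 {a=3, b=2}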
SELECT
TRANSFORM_VALUES(
MULTIMAP_FROM_ENTRIES(
TRANSFORM(ARRAY['a', 'a', 'a', 'b', 'b'], x -> ROW(x, 1))
),
(k, v) -> ARRAY_SUM(v)
)
Output:
{
"a": 3,
"b": 2
}
You can use REDUCE if ARRAY_SUM is not supported; the result is the same map:
SELECT
TRANSFORM_VALUES(
MULTIMAP_FROM_ENTRIES(
TRANSFORM(ARRAY['a', 'a', 'a', 'b', 'b'], x -> ROW(x, 1))
),
(k, v) -> REDUCE(v, 0, (s, x) -> s + x, s -> s)
)
In Presto 0.279, there is now a direct function for this purpose: array_frequency. The input is your ARRAY and the output is a MAP, where the keys are the elements of the given array and the values are their frequencies. For example, if you run this SQL:
SELECT array_frequency(ARRAY[1,4,1,3,5,4,7,3,1])
The result will be
{
"1": 3,
"3": 2,
"4": 2,
"5": 1,
"7": 1
}

How to extract separate values from GeoJSON in BigQuery

I have a GeoJSON string for a multipoint geometry. I want to extract each of those points to a table of individual point geometries in BigQuery
I have been able to achieve a point geometry for one of the points, and I want to do it for all the others in an automated fashion. I've already tried converting the string to an array, but it remains an array of size 1 with the entire content as a single string.
This is what worked for me to extract one point and convert it to a geometry:
WITH temp_table as (select '{ "type": "MultiPoint", "coordinates": [ [ 20, 10 ], [ 30, 5 ], [ 90, 50 ], [ 40, 80 ] ] }' as string)
select ST_GEOGPOINT(CAST(JSON_EXTRACT(string, '$.coordinates[0][0]') as FLOAT64), CAST(JSON_EXTRACT(string, '$.coordinates[0][1]') as FLOAT64)) from temp_table
This results in POINT(20 10)
I can write manual queries for each of these points and do a UNION ALL, but that won't scale or work every time. I want to achieve this in an automated fashion. For architectural reasons, we can't do string manipulation in languages like Python.
Below is for BigQuery Standard SQL
#standardSQL
SELECT
ARRAY(
SELECT ST_GEOGPOINT(
CAST(SPLIT(pair)[OFFSET(0)] AS FLOAT64), CAST(SPLIT(pair)[SAFE_OFFSET(1)] AS FLOAT64))
FROM UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT(STRING, '$.coordinates'), r'\[(\d+,\d+)\]')) pair
) points
FROM `project.dataset.temp_table`
You can test and play with the above using the sample data from your question, as in the example below:
#standardSQL
WITH `project.dataset.temp_table` AS (
SELECT '{ "type": "MultiPoint", "coordinates": [ [ 20, 10 ], [ 30, 5 ], [ 90, 50 ], [ 40, 80 ] ] }' AS STRING
)
SELECT
ARRAY(
SELECT ST_GEOGPOINT(
CAST(SPLIT(pair)[OFFSET(0)] AS FLOAT64), CAST(SPLIT(pair)[SAFE_OFFSET(1)] AS FLOAT64))
FROM UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT(STRING, '$.coordinates'), r'\[(\d+,\d+)\]')) pair
) points
FROM `project.dataset.temp_table`
with result
Row  points
1    POINT(20 10)
     POINT(30 5)
     POINT(90 50)
     POINT(40 80)
Note: in the above version, an array of points is produced for each original row. Obviously, you can adjust it to flatten the output, as in the example below:
#standardSQL
WITH `project.dataset.temp_table` AS (
SELECT '{ "type": "MultiPoint", "coordinates": [ [ 20, 10 ], [ 30, 5 ], [ 90, 50 ], [ 40, 80 ] ] }' AS STRING
)
SELECT
ST_GEOGPOINT(
CAST(SPLIT(pair)[OFFSET(0)] AS FLOAT64), CAST(SPLIT(pair)[SAFE_OFFSET(1)] AS FLOAT64)
) points
FROM `project.dataset.temp_table`, UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT(STRING, '$.coordinates'), r'\[(\d+,\d+)\]')) pair
with result
Row  points
1    POINT(20 10)
2    POINT(30 5)
3    POINT(90 50)
4    POINT(40 80)
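Note that the regex r'\[(\d+,\d+)\]' only matches non-negative integer coordinates. A sketch that avoids the regex by using JSON_EXTRACT_ARRAY (available in current BigQuery Standard SQL), and so also handles negative or fractional values:
#standardSQL
WITH `project.dataset.temp_table` AS (
SELECT '{ "type": "MultiPoint", "coordinates": [ [ 20, 10 ], [ 30, 5 ], [ 90, 50 ], [ 40, 80 ] ] }' AS STRING
)
SELECT
ST_GEOGPOINT(
CAST(JSON_EXTRACT_SCALAR(pair, '$[0]') AS FLOAT64), -- longitude
CAST(JSON_EXTRACT_SCALAR(pair, '$[1]') AS FLOAT64)  -- latitude
) points
FROM `project.dataset.temp_table`,
UNNEST(JSON_EXTRACT_ARRAY(STRING, '$.coordinates')) pair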

In PostgreSQL, what's the best way to select an object from a JSONB array?

Right now, I have an array that I'm able to select off a table.
[{"_id": 1, "count": 3}, {"_id": 2, "count": 14}, {"_id": 3, "count": 5}]
From this, I only need the count for a particular _id. For example, I need the count for
_id: 3
I've read the documentation but I haven't been able to figure out the correct way to get the object.
WITH test_array(data) AS ( VALUES
('[
{"_id": 1, "count": 3},
{"_id": 2, "count": 14},
{"_id": 3, "count": 5}
]'::JSONB)
)
SELECT val->>'count' AS result
FROM
test_array ta,
jsonb_array_elements(ta.data) val
WHERE val @> '{"_id":3}'::JSONB;
Result:
result
--------
5
(1 row)
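On PostgreSQL 12+, the same filter can be expressed with a JSON path query; a sketch against the same test_array CTE:
WITH test_array(data) AS ( VALUES
('[
{"_id": 1, "count": 3},
{"_id": 2, "count": 14},
{"_id": 3, "count": 5}
]'::JSONB)
)
-- filter the array elements by _id and project their count
SELECT jsonb_path_query(ta.data, '$[*] ? (@._id == 3).count') AS result
FROM test_array ta;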