Pivoting data into JSON with numbered fields - sql

In SQL Server 2017, I have a table as follows:
CREATE TABLE #Data
(
Code VARCHAR (2)
, RegionCode VARCHAR (10)
, Prop INT
, Val VARCHAR (200)
, PropF VARCHAR (50)
, PropFD VARCHAR (200)
)
INSERT INTO #Data
(
Code, RegionCode, Prop, Val, PropF, PropFD
)
VALUES
('AD', 'DLSO324', 1, 'Abcdefg', 'SD', 'SomeDescription')
, ('AD', 'DLSO324', 2, 'sdfadf', 'SA', 'SomethingA')
, ('AD', 'DLSO324', 3, 'gfdsdfg', 'SB', 'SomethingB')
, ('AD', 'DLSO324', 4, 'r43df', 'SC', 'SomethingC')
, ('AD', 'DLSO324', 5, 'GHD-123', 'SD2', 'SomethingD')
, ('AD', 'DLSO324', 6, '2013-03-42', 'SE', 'SomethingE')
, ('AD', 'XR1046', 34, 'Value1', 'dsf', 'Desc1')
, ('AD', 'XR1046', 65, 'Value1', 'gfsd', 'Desc1')
, ('AD', 'XR1046', 23, 'Value1', 'dg', 'Desc1')
, ('AD', 'XR1046', 67, 'Value1', 'fgh', 'Desc1')
, ('AD', 'XR1046', 45, 'Value1', 'fh', 'Desc1')
, ('AD', 'XR1046', 99, 'Value1', 'hfgfgh', 'Desc1')
SELECT *
FROM #Data
where you'll notice that a Code and RegionCode combination has multiple Props, each Prop having a value (Val), a property code (PropF), and a property field description (PropFD). The number of properties a Code and RegionCode combination can have varies anywhere between 1 and 100, and different combinations of Code and RegionCode can have different PropF and PropFD values even if they share the same Prop number.
What I need to do is write a query that pivots the data and produces one row per Code and RegionCode with some JSON data. I need to completely flatten out the JSON so that each Prop number has its own Val, PropF, and PropFD field. My desired structure is as follows (you'll notice that the _number suffix corresponds to the Prop value in the #Data table):
[
{
"Val_1": "Abcdefg",
"PropF_1": "SD",
"PropFD_1": "SomeDescription",
"Val_2": "sdfadf",
"PropF_2": "SA",
"PropFD_2": "SomethingA",
"Val_3": "gfdsdfg",
"PropF_3": "SB",
"PropFD_3": "SomethingB",
"Val_4": "r43df",
"PropF_4": "SC",
"PropFD_4": "SomethingC",
"Val_5": "GHD-123",
"PropF_5": "SD2",
"PropFD_5": "SomethingD",
"Val_6": "2013-03-42",
"PropF_6": "SE",
"PropFD_6": "SomethingE"
}
]
So far I have the following query:
SELECT x.Code
, x.RegionCode
, ( SELECT y.Prop id
, y.Val
, y.PropF
, y.PropFD
FROM #Data y
WHERE y.Code = x.Code
AND y.RegionCode = x.RegionCode
FOR JSON PATH) FieldData
FROM #Data x
GROUP BY x.Code
, x.RegionCode
Is there a way for me to get my desired structure using JOINs and the SQL Server 2017 JSON functions? I want to avoid using PIVOT if possible due to performance reasons.

Since SQL Server is declarative by design, your desired results would require either Dynamic SQL or some String Manipulation.
The following demonstrates a little string manipulation in concert with string_agg()
Example
SELECT Code
,RegionCode
,FieldData = '[{'+string_agg(concat('"Val_',prop,'":"',Val,'","PropF_',Prop,'":"',PropF,'","PropFD_',Prop,'":"',PropFD,'"'),',')+'}]'
FROM #Data
Group By Code,RegionCode
Results
The First Record's JSON
[
{
"Val_1": "Abcdefg",
"PropF_1": "SD",
"PropFD_1": "SomeDescription",
"Val_2": "sdfadf",
"PropF_2": "SA",
"PropFD_2": "SomethingA",
"Val_3": "gfdsdfg",
"PropF_3": "SB",
"PropFD_3": "SomethingB",
"Val_4": "r43df",
"PropF_4": "SC",
"PropFD_4": "SomethingC",
"Val_5": "GHD-123",
"PropF_5": "SD2",
"PropFD_5": "SomethingD",
"Val_6": "2013-03-42",
"PropF_6": "SE",
"PropFD_6": "SomethingE"
}
]
The Second Record's JSON
[
{
"Val_34": "Value1",
"PropF_34": "dsf",
"PropFD_34": "Desc1",
"Val_65": "Value1",
"PropF_65": "gfsd",
"PropFD_65": "Desc1",
"Val_23": "Value1",
"PropF_23": "dg",
"PropFD_23": "Desc1",
"Val_67": "Value1",
"PropF_67": "fgh",
"PropFD_67": "Desc1",
"Val_45": "Value1",
"PropF_45": "fh",
"PropFD_45": "Desc1",
"Val_99": "Value1",
"PropF_99": "hfgfgh",
"PropFD_99": "Desc1"
}
]
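One caveat: this string concatenation does not escape JSON special characters. If Val, PropF, or PropFD can contain quotes or backslashes, a safer variant of the same query (a sketch, assuming SQL Server 2016+ for STRING_ESCAPE) is:
SELECT Code
,RegionCode
,FieldData = '[{'+string_agg(concat('"Val_',Prop,'":"',string_escape(Val,'json'),'","PropF_',Prop,'":"',string_escape(PropF,'json'),'","PropFD_',Prop,'":"',string_escape(PropFD,'json'),'"'),',')+'}]'
FROM #Data
Group By Code,RegionCode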

Related

How can I put a SELECT result set into a SELECT column?

I have two tables: Job (ID, Name, etc.) and Address (ID, Job_ID, Name, etc.). I want to get a result like this:
[
{
"Job_ID": 1,
"JobName": "Test",
"Addresses": [
{
"ID": 1,
"Name": "King street"
},
{
"ID": 2,
"Name": "Queen`s street
}
]
}
]
My current query that gets only one address for a job looks like this:
SELECT TOP 100
JO.ID,
JO.Closed as Deleted,
JO.Number as JobNumber,
JO.Name as JobName,
Convert(date, JO.Start_Date) as Start_Date,
JO.Job_Status_ID as Status,
A.ID as Address_ID,
A.Name as Name,
A.Number as Number,
A.Sort_Name as Sort_Name,
A.Address_1 as Address_1,
A.Address_2 as Address_2,
A.ZipCode as ZIP,
A.E_Mail_Address as Email,
A.Web_Site_URL as Web_Site_URL,
A.TAXRATE as Tax_Rate,
A.State
FROM Job JO
INNER JOIN Address A ON A.Job_Id = JO.ID
Is it possible without pivot table(Address_ID, Job_ID)?
You can use FOR JSON to convert your results to JSON. This gives the result you are looking for:
CREATE TABLE #Job (ID INT NOT NULL, Name VARCHAR(50));
INSERT #Job (ID, Name)
VALUES (1, 'Job 1'), (2, 'Job 2');
CREATE TABLE #Address (ID INT NOT NULL, JobID INT NOT NULL, Name VARCHAR(50));
INSERT #Address (ID, JobID, Name)
VALUES (1, 1, 'King street'), (2, 1, 'Queen''s street'), (3, 2, 'Address 3'), (4, 2, 'Address 4');
SELECT JobID = j.ID,
JobName = j.Name,
Addresses = ( SELECT a.ID, a.Name
FROM #Address AS a
WHERE a.JobID = j.ID
FOR JSON AUTO
)
FROM #Job AS j
FOR JSON AUTO;
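If you want explicit control over the property names (for example Job_ID rather than JobID), FOR JSON PATH is the usual alternative to AUTO; a minimal sketch against the same temp tables, matching the Job_ID / JobName / Addresses keys in the desired output:
SELECT Job_ID = j.ID,
       JobName = j.Name,
       Addresses = ( SELECT a.ID, a.Name
                     FROM #Address AS a
                     WHERE a.JobID = j.ID
                     FOR JSON PATH
                   )
FROM #Job AS j
FOR JSON PATH;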

SNOWFLAKE : read a VARIANT column as a table?

We have the following table
WITH fake_data(columnA, columnB, columnC) as (
select * from values
(1, 'hello1', 'world18'),
(1, 'hello2', 'world27'),
(2, 'hello9', 'world36')
(3, NULL, 'world35')
(10, 'hello13', 'world5')
)
We convert the entire table into a single column that has a JSON-like structure
CREATE OR REPLACE TEMPORARY TABLE LISTE_JSON (V variant)
AS
WITH COLONNE_KEY
AS (
SELECT
ROW_NUMBER () OVER (ORDER BY columnA DESC) KEY_AUTO
,A.*
FROM fake_data A
),
COLONNE_OBJECT
AS (
SELECT
object_agg(
TO_CHAR(KEY_AUTO ) ,
object_construct(
'columnA', IFNULL(columnA,''),
'columnB', IFNULL(columnB,''),
'columnC', IFNULL(columnC,''),
)
)AS COLONNE_OBJECT
FROM COLONNE_KEY
)
SELECT *
FROM COLONNE_OBJECT;
So far everything is going well.
Now how do I read the variant column through a SELECT and see it as a table, as it was at the beginning?
Ex:
SELECT *
FROM LISTE_JSON
COLUMNA COLUMNB COLUMNC
1 hello1 world18
1 hello2 world27
2 hello9 world36
3 '' world35
10 hello13 world5
You can either use PIVOT to pull out the parts, or you can hand-roll the pivot via GROUP BY
SELECT
columna
,max(iff(columnb='hello1', columnc, null)) as hello1
,max(iff(columnb='hello2', columnc, null)) as hello2
,max(iff(columnb='hello3', columnc, null)) as hello3
from table
group by 1 order by 1;
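Applied to the fake_data above, that hand-rolled pivot looks like this (a sketch; the hello* columns must be listed out explicitly, or generated with dynamic SQL, since SQL needs the output columns known up front):
WITH fake_data(columnA, columnB, columnC) as (
select * from values
(1, 'hello1', 'world18'),
(1, 'hello2', 'world27'),
(2, 'hello9', 'world36'),
(3, NULL, 'world35'),
(10, 'hello13', 'world5')
)
select columna
,max(iff(columnb='hello1', columnc, null)) as hello1
,max(iff(columnb='hello2', columnc, null)) as hello2
,max(iff(columnb='hello9', columnc, null)) as hello9
,max(iff(columnb='hello13', columnc, null)) as hello13
from fake_data
group by 1 order by 1;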
So let's start with a working version of the example code:
WITH fake_data(columnA, columnB, columnC) as (
select * from values
(1, 'hello1', 'world18'),
(1, 'hello2', 'world27'),
(2, 'hello9', 'world36'),
(3, NULL, 'world35'),
(10, 'hello13', 'world5')
), COLONNE_KEY AS (
SELECT
ROW_NUMBER () OVER (ORDER BY columnA DESC) KEY_AUTO
,A.*
FROM fake_data A
), COLONNE_OBJECT AS (
SELECT
object_agg( KEY_AUTO::text ,
object_construct('columnA', IFNULL(columnA::text,''),
'columnB', IFNULL(columnB::text,''),
'columnC', IFNULL(columnC::text,'')
)
)AS COLONNE_OBJECT
FROM COLONNE_KEY
)
SELECT *
FROM COLONNE_OBJECT;
gives:
COLONNE_OBJECT
{ "1": { "columnA": "10", "columnB": "hello13", "columnC": "world5" }, "2": { "columnA": "3", "columnB": "", "columnC": "world35" }, "3": { "columnA": "2", "columnB": "hello9", "columnC": "world36" }, "4": { "columnA": "1", "columnB": "hello1", "columnC": "world18" }, "5": { "columnA": "1", "columnB": "hello2", "columnC": "world27" } }
which you would like to get back into its original table form, thus:
SELECT
f.value:"columnA"::number as columna,
f.value:"columnB"::text as columnb,
f.value:"columnC"::text as columnc
FROM COLONNE_OBJECT, table(flatten(input=>colonne_object)) f;
gives you back
COLUMNA  COLUMNB         COLUMNC
10       hello13         world5
3        <empty string>  world35
2        hello9          world36
1        hello1          world18
1        hello2          world27
and the empty string can be swapped back in via
nullif(f.value:"columnB"::text,'') as columnb,
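One more note (an assumption worth checking against your data): FLATTEN over an OBJECT does not guarantee key order, so if you want the rows back in their original KEY_AUTO order, sort on the key explicitly:
SELECT
f.value:"columnA"::number as columna,
nullif(f.value:"columnB"::text,'') as columnb,
f.value:"columnC"::text as columnc
FROM COLONNE_OBJECT, table(flatten(input=>colonne_object)) f
ORDER BY f.key::int;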

Select within Structs within Arrays in SQL

I'm trying to find rows with N count of identifier A AND M count of identifier B in an array of structs within a Google BigQuery table, using the new Standard SQL. Each row in the table (simplified) looks a bit like this:
{
  "Session": "abc123",
  "Information": [
    {
      "Identifier": "A",
      "Count": 1
    },
    {
      "Identifier": "B",
      "Count": 2
    },
    {
      "Identifier": "C",
      "Count": 3
    }
    ...
  ]
}
I've been struggling to work with the struct in an array. Any way I can do that?
Below is for BigQuery Standard SQL
#standardSQL
SELECT *
FROM `project.dataset.table`
WHERE 2 = (SELECT COUNT(1) FROM UNNEST(information) kv WHERE kv IN (('a', 5), ('b', 10)))
If applied to the dummy data in the example below:
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'abc123' session, [STRUCT('a' AS identifier, 1 AS `count`), ('b', 2), ('c', 3)] information UNION ALL
SELECT 'abc456', [('a', 5), ('b', 10), ('c', 20)]
)
SELECT *
FROM `project.dataset.table`
WHERE 2 = (SELECT COUNT(1) FROM UNNEST(information) kv WHERE kv IN (('a', 5), ('b', 10)))
result is
Row session information.identifier information.count
1 abc456 a 5
b 10
c 20
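An equivalent, more explicit phrasing of the same filter (a sketch against the same dummy table), with one EXISTS per required identifier / count pair:
#standardSQL
SELECT *
FROM `project.dataset.table` t
WHERE EXISTS (SELECT 1 FROM UNNEST(t.information) kv WHERE kv.identifier = 'a' AND kv.`count` = 5)
AND EXISTS (SELECT 1 FROM UNNEST(t.information) kv WHERE kv.identifier = 'b' AND kv.`count` = 10)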

Parsing string in a hive table

I have a Hive table which has two columns (day, type_of_day), both of type string:
"monday" [{"temp" : 45, "weather": "rainny"}, {"temp" : 25, "weather": "sunny"}, {"temp" : 15, "weather": "storm"}]
"tuesday" [{"temp" : 5, "weather": "winter"}, {"temp" : 10, "weather": "sun"}, {"temp" : 18, "weather": "dawn"}]
I want to split it (I guess explode is the technical term) and then just get a list of the temps for each day. I'm familiar with how to do this in Python, but is there a way to do this directly in Hive?
"monday" [45, 25, 15]
"tuesday" [5, 10, 18]
Testing with your data example. Replace CTE with your table. Read comments in the code:
with your_table as (--use your table instead of this CTE
select stack(2,
"monday",'[{"temp" : 45, "weather": "rainny"}, {"temp" : 25, "weather": "sunny"}, {"temp" : 15, "weather": "storm"}]',
"tuesday" ,'[{"temp" : 5, "weather": "winter"}, {"temp" : 10, "weather": "sun"}, {"temp" : 18, "weather": "dawn"}]'
)as (day, type_of_day)
) --use your table instead of this CTE
select s.day, array(get_json_object(type_of_day_array[0],'$.temp'),
get_json_object(type_of_day_array[1],'$.temp'),
get_json_object(type_of_day_array[2],'$.temp')
) as result_array --extract JSON elements and construct array
from
(
select day, split(regexp_replace(regexp_replace(type_of_day,'\\[|\\]',''), --remove square brackets
'\\}, *\\{','\\}##\\{'), --make convenient split separator
'##') --split
as type_of_day_array
from your_table --use your table instead of this CTE
)s;
Result:
s.day result_array
monday ["45","25","15"]
tuesday ["5","10","18"]
If the array of JSON can contain more than three elements, then you can use lateral view explode or posexplode and then build the resulting array like in this answer: https://stackoverflow.com/a/51570035/2700344.
Wrap array elements in cast(... as int) if you need array<int> as a result instead of array<string>:
cast(get_json_object(type_of_day[0],'$.temp') as int)...
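For completeness, a sketch of that explode-based variant (same split trick as above; note that collect_list does not guarantee element order, so use posexplode and re-sort if order matters):
select s.day,
       collect_list(cast(get_json_object(e.element,'$.temp') as int)) as result_array
from
(
 select day, split(regexp_replace(regexp_replace(type_of_day,'\\[|\\]',''),
                                  '\\}, *\\{','\\}##\\{'),
                   '##') as type_of_day_array
 from your_table
)s
lateral view explode(s.type_of_day_array) e as element
group by s.day;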

folding over large BigQuery result

Is there any easy way for me to do something like OCaml's fold_left on the result of a BigQuery query, where each iteration corresponds to one row in the result?
What product or approach would be the easiest way? It would be great if:
all I need to do is to supply the initial state and the 'folder' function
preferably, I'd like to write the 'folder' function in a functional language
I don't need to install any GCP package
Since I don't know which product or language would work, I cannot be more specific, but pseudocode would be like:
let my_init = []
let my_folder = fun state row ->
// append for now, but it will be complicated. I need to do some set operations here. The point is that I need some way of transferring "state" across rows, when I iterate over rows in a predefined order.
row.col1 :: state
let query = "SELECT col1, col2, col3 FROM table1 ORDER BY timestamp"
query |> List.fold my_folder my_init
The result that I want to get from this simplified example is the final "state".
--- UPDATED ---
There is no bound on the number of rows---if we receive more, we get more rows. Typically, the number is more than a few millions but it can be larger than that.
Here's a simplified example that shows the major problem I'm encountering. We have a table with a few columns:
timestamp
user_id: a string id
operation_json: a stringified JSON object, which is a list of operations, each of which corresponds to either:
add user_id to a set
remove user_id from a set
For example, the following are valid rows:
----------+---------+----------------------------------------------
timestamp | user_id | operation_json
----------+---------+----------------------------------------------
1 | id1 | [ { "op": "add", "set": "set1" } ]
2 | id2 | [ { "op": "add", "set": "set1" } ]
3 | id1 | [ { "op": "add", "set": "set2" } ]
4 | id3 | [ { "op": "add", "set": "set2" } ]
5 | id1 | [ { "op": "remove", "set": "set1" } ]
----------+---------+----------------------------------------------
As a result, I'd like to get sets of users; i.e.,
set1 |-> { id2 }
set2 |-> { id1, id3 }
I thought a fold_left-like operation would be convenient. The state would be a map<set_name, set<user_id>>, and the initial state would be an empty map.
Below [quick and simple] example for BigQuery Standard SQL
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<INT64>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue);
return arr.reduce(reducer, parseInt(init)); // start from the supplied initial state
""";
WITH `project.dataset.table` AS (
SELECT 1 id, [1, 2, 3, 4] arr, 5 initial_state UNION ALL
SELECT 2, [1, 2, 3, 4, 5, 6, 7], 10
)
SELECT id, fold(arr, initial_state) result
FROM `project.dataset.table`
output is
Row id result
1 1 15.0
2 2 38.0
I think it is self-explanatory enough
See more for JS UDF
folding list of rows
See below an extension of the above.
Here you assemble an array from the result's rows before applying the fold function (of course, you have UDF limits to keep in mind here, as well as limits on how big your ARRAY of rows can grow, etc.).
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<INT64>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue);
return arr.reduce(reducer, parseInt(init));
""";
WITH `project.dataset.table` AS (
SELECT 1 id, 1 item UNION ALL
SELECT 1, 2 UNION ALL
SELECT 1, 3 UNION ALL
SELECT 1, 4 UNION ALL
SELECT 2, 1 UNION ALL
SELECT 2, 2 UNION ALL
SELECT 2, 3 UNION ALL
SELECT 2, 4 UNION ALL
SELECT 2, 5 UNION ALL
SELECT 2, 6 UNION ALL
SELECT 2, 7
)
SELECT id, fold(ARRAY_AGG(item), 5) result
FROM `project.dataset.table`
GROUP BY id
Note: if you need to include more than one field from each row, you can use an ARRAY of STRUCTs, as in the example below:
ARRAY_AGG(STRUCT(id, item) ORDER BY id)
Of course, you will need to adjust the signature of the fold UDF accordingly.
For example:
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<STRUCT<id INT64, item INT64>>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue.item);
return arr.reduce(reducer, parseInt(init));
""";
WITH `project.dataset.table` AS (
SELECT 1 id, 1 item UNION ALL
SELECT 1, 2 UNION ALL
SELECT 1, 3 UNION ALL
SELECT 1, 4 UNION ALL
SELECT 2, 1 UNION ALL
SELECT 2, 2 UNION ALL
SELECT 2, 3 UNION ALL
SELECT 2, 4 UNION ALL
SELECT 2, 5 UNION ALL
SELECT 2, 6 UNION ALL
SELECT 2, 7
)
SELECT id, fold(ARRAY_AGG(t), 5) result
FROM `project.dataset.table` t
GROUP BY id
The approach below has nothing to do with folding per se; rather, it attempts to translate your challenge into a set-based one (which is more natural when you are dealing with SQL) by identifying the latest op for each user per set: if that op is "remove", the user is simply eliminated from further consideration; if it is "add", the latest "add" for that user / set is used. This assumes there cannot be multiple consecutive "add" actions for the same user / set; rather, the ops alternate add / remove / add and so on. Of course, this can be adjusted further based on the real use case.
So, having the above in mind, below is an example for BigQuery Standard SQL
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 ts, 'id1' user_id, '[ { "op": "add", "set": "set1" } ]' operation_json UNION ALL
SELECT 2, 'id2', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 3, 'id1', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 4, 'id3', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 5, 'id1', '[ { "op": "remove", "set": "set1" } ]'
)
SELECT bin, STRING_AGG(user_id, ',' ORDER BY ts) result
FROM (
SELECT user_id, bin, ARRAY_AGG(ts ORDER BY ts DESC LIMIT 1)[OFFSET(0)] ts
FROM (
SELECT ts, user_id, op, bin, LAST_VALUE(op) OVER(win) fin
FROM (
SELECT ts, user_id,
JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.op') op,
JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.set') bin
FROM `project.dataset.table`
)
WINDOW win AS (
PARTITION BY user_id, bin
ORDER BY ts
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
)
WHERE fin = 'add'
GROUP BY user_id, bin
)
GROUP BY bin
-- ORDER BY bin
output is
Row bin result
1 set1 id2
2 set2 id1,id3
If applied to the dummy data below:
WITH `project.dataset.table` AS (
SELECT 1 ts, 'id1' user_id, '[ { "op": "add", "set": "set1" } ]' operation_json UNION ALL
SELECT 2, 'id2', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 3, 'id1', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 4, 'id3', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 5, 'id1', '[ { "op": "remove", "set": "set1" } ]' UNION ALL
SELECT 6, 'id1', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 7, 'id1', '[ { "op": "remove", "set": "set1" } ]' UNION ALL
SELECT 8, 'id1', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 9, 'id1', '[ { "op": "remove", "set": "set2" } ]' UNION ALL
SELECT 10, 'id1', '[ { "op": "add", "set": "set2" } ]'
)
result will be
Row bin result
1 set1 id2,id1
2 set2 id3,id1
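For the record, the same "latest op wins" filtering can be written a bit more compactly (a sketch, under the same assumptions as above): grab the latest row per user / set with ARRAY_AGG(... ORDER BY ts DESC LIMIT 1) and keep it only when that op is an "add".
#standardSQL
SELECT bin, STRING_AGG(user_id, ',' ORDER BY last.ts) result
FROM (
  SELECT user_id, bin,
    ARRAY_AGG(STRUCT(ts, op) ORDER BY ts DESC LIMIT 1)[OFFSET(0)] last
  FROM (
    SELECT ts, user_id,
      JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.op') op,
      JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.set') bin
    FROM `project.dataset.table`
  )
  GROUP BY user_id, bin
)
WHERE last.op = 'add'
GROUP BY bin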