Collecting distinct key value-pairs of Nested type in ClickHouse into arrays - sql

I have data with the following schema in ClickHouse:
CREATE TABLE table AS (
key String,
…
nested Nested (
key String,
value String
)
) …
Some example data:
key | … | nested |
----|---|-------------------------------|
k1 | | [{"key": "a", "value": "1"}] |
k1 | | [{"key": "a", "value": "2"}] |
k1 | | [{"key": "a", "value": "1"}, |
| | {"key": "a", "value": "2"}] |
k1 | | [{"key": "b", "value": "3"}] |
I want to group by the key and collect all the distinct key-value pairs into two arrays:
key | nested.key | nested.value |
------|-----------------|------------------|
k1 | ["a", "a", "b"] | ["1", "2", "3"] |
What is the simplest and most efficient way to do this in ClickHouse?

I would suggest this query:
-- Collect the distinct (key, value) pairs of `nested` for every `key`, then
-- split the pairs back into two parallel arrays.
SELECT
    key,
    -- ARRAY JOIN (below) unrolls `nested`, so each input row contributes one
    -- (nested.key, nested.value) tuple to the aggregate.
    arrayDistinct(groupArray((nested.key, nested.value))) AS distinctNested,
    arrayMap(pair -> pair.1, distinctNested) AS `nested.keys`,
    arrayMap(pair -> pair.2, distinctNested) AS `nested.values`
FROM test.table_002
ARRAY JOIN nested
-- GROUP BY already yields exactly one row per key, so SELECT DISTINCT would
-- only add a redundant deduplication step.
GROUP BY key
/* Result
┌─key─┬─distinctNested──────────────────┬─nested.keys───┬─nested.values─┐
│ k1 │ [('a','1'),('a','2'),('b','3')] │ ['a','a','b'] │ ['1','2','3'] │
└─────┴─────────────────────────────────┴───────────────┴───────────────┘
*/
/* Test data preparation: a Memory-engine table holding the sample rows from the question. */
CREATE TABLE test.table_002 (
key String,
nested Nested (key String, value String)
) ENGINE = Memory;
/* Each JSON row supplies the Nested column as two parallel arrays: "nested.key" and "nested.value". */
INSERT INTO test.table_002
FORMAT JSONEachRow
{"key": "k1", "nested.key":["a"], "nested.value": ["1"]}
{"key": "k1", "nested.key":["a"], "nested.value": ["2"]}
{"key": "k1", "nested.key":["a", "a"], "nested.value": ["1", "2"]}
{"key": "k1", "nested.key":["b"], "nested.value": ["3"]}

Related

Swap values between columns based on third column

I have a table like this:
src_id | src_source | dst_id | dst_source | metadata
--------------------------------------------------------
123 | A | 345 | B | some_string
234 | B | 567 | A | some_other_string
498 | A | 432 | A | another_one # this line should be ignored
765 | B | 890 | B | another_one # this line should be ignored
What I would like is:
A_id | B_id | metadata
-----------------------
123 | 345 | some_string
567 | 234 | some_other_string
Here's the data to replicate:
# Column names, in the same positional order as the values of each row tuple.
cols = ["src_id", "src_source", "dst_id", "dst_source", "metadata"]

# Sample rows; the last two have identical sources on both sides.
data = [
    ("123", "A", "345", "B", "some_string"),
    ("234", "B", "567", "A", "some_other_string"),
    ("498", "A", "432", "A", "another_one"),
    ("765", "B", "890", "B", "another_two"),
]

# Build the DataFrame first, then assign the column names positionally.
unnamed_df = spark.createDataFrame(data)
df = unnamed_df.toDF(*cols)
I am a bit confused as to how to do this - I got to here:
# Work-in-progress attempt: keep only the rows whose two sides come from
# different sources, then derive A_id / B_id.
# Both new columns read src_id only; dst_id is never used and neither when()
# has an otherwise() branch, so this cannot yet place dst_id in either column.
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
# A_id: src_id when the source side is "A".
.withColumn("A_id",
F.when(F.col("src_source") == "A", F.col("src_id")))
# B_id: src_id when the source side is "B".
.withColumn("B_id",
F.when(F.col("src_source") == "B", F.col("src_id")))
)
I think I figured it out - I need to split the df and union again!
# Rows going from an "A" record to a "B" record: src is A, dst is B.
# Requiring src_source == "A" and dst_source == "B" already implies that the
# two sources differ, so a separate src_source != dst_source filter is not needed.
ab_df = (
    df
    .filter((F.col("src_source") == "A") & (F.col("dst_source") == "B"))
    .select(
        F.col("src_id").alias("A_id"),
        F.col("dst_id").alias("B_id"),
        "metadata",
    )
)

# Rows going from "B" to "A": swap the aliases so that A_id always carries
# the id of the "A" record.
ba_df = (
    df
    .filter((F.col("src_source") == "B") & (F.col("dst_source") == "A"))
    .select(
        F.col("src_id").alias("B_id"),
        F.col("dst_id").alias("A_id"),
        "metadata",
    )
)

# The two selects list A_id / B_id in different orders, hence unionByName
# rather than a positional union.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# only so that code referring to this variable keeps working.
all = ab_df.unionByName(ba_df)
You can do it without union, just in one select, without the need to write the same filter twice.
# Single pass: drop the same-source rows, then route src_id / dst_id into
# A_id / B_id depending on which side carries source "A".
src_is_a = F.col("src_source") == "A"
mixed_sources_df = df.filter(F.col("src_source") != F.col("dst_source"))

a_id = F.when(src_is_a, F.col("src_id")).otherwise(F.col("dst_id")).alias("A_id")
b_id = F.when(src_is_a, F.col("dst_id")).otherwise(F.col("src_id")).alias("B_id")

output = mixed_sources_df.select(a_id, b_id, "metadata")
output.show()
# +----+----+-----------------+
# |A_id|B_id| metadata|
# +----+----+-----------------+
# | 123| 345| some_string|
# | 567| 234|some_other_string|
# +----+----+-----------------+

Create tabular View by Spreading Data from JSON in Snowflake

I'm very new to Snowflake and I am working on creating a view from a table that holds JSON data as follows:
{
"data": {
"baseData": {
"dom_url": "https://www.soccertables.com/european_tables",
"event_id": "01b2722a-d8e6-4f67-95d0-8dd7ba088a4a",
"event_utc_time": "2020-05-11 09:01:14.821",
"ip_address": "125.238.134.96",
"table_1": [
{
"position": "1",
"team_name": "Liverpool",
"games_played": "29",
"games_won": "26",
"games_drawn": "2",
"games_lost": "1",
"goals_for": "75",
"goals_against": "35",
"points": "80"
},
{
"position": "2",
"team_name": "Man. City",
"games_played": "29",
"games_won": "20",
"games_drawn": "5",
"games_lost": "4",
"goals_for": "60",
"goals_against": "45",
"points": "65"
},
{
"position": "...",
"team_name": "...",
"games_played": "...",
"games_won": "...",
"games_drawn": "...",
"games_lost": "...",
"goals_for": "...",
"goals_against": "...",
"points": "..."
}
],
"unitID": "CN 8000",
"ver": "1.0.0"
},
"baseType": "MatchData"
},
"dataName": "CN8000.Prod.MatchData",
"id": "18a89f9e-9620-4453-a546-23412025e7c0",
"tags": {
"itrain.access.level1": "Private",
"itrain.access.level2": "Kumar",
"itrain.internal.deviceID": "",
"itrain.internal.deviceName": "",
"itrain.internal.encodeTime": "2022-03-23T07:41:19.000Z",
"itrain.internal.sender": "Harish",
"itrain.software.name": "",
"itrain.software.partNumber": 0,
"itrain.software.version": ""
},
"timestamp": "2021-02-25T07:32:31.000Z"
}
I want to extract the common values like dom_url, event_id, event_utc_time, ip_address along with each team_name in a separate column and the associated team details like position, games_played etc possibly in rows for each team name
E.g :
I've been trying Lateral flatten function but couldn't succeed so far
-- One row per element of data.baseData.table_1 (one team entry), with the
-- event-level attributes repeated on every row.
-- TEAM_NAME is appended as the last column so the existing column positions
-- are unchanged.
create or replace view AWSS3_PM.PUBLIC.PM_POWER_CN8000_V1(
    DOM_URL,
    EVENT_ID,
    EVENT_UTC_TIME,
    IP_ADDRESS,
    TIMESTAMP,
    POSITION,
    GAMES_PLAYED,
    GAMES_WON,
    GAMES_LOST,
    GAMES_DRAWN,
    TEAM_NAME
) as
select
    -- Every path expression yields a VARIANT; cast each one explicitly so the
    -- view exposes plain typed columns instead of quoted VARIANT strings.
    src.c1:data:baseData:dom_url::string               as dom_url,
    src.c1:data:baseData:event_id::string              as event_id,
    src.c1:data:baseData:event_utc_time::timestamp_ntz as event_utc_time,
    src.c1:data:baseData:ip_address::string            as ip_address,
    src.c1:timestamp::timestamp_tz                     as event_timestamp,
    -- Per-team attributes come from the flattened array element.
    team.value:position::number                        as team_position,
    team.value:games_played::number                    as games_played,
    team.value:games_won::number                       as games_won,
    team.value:games_lost::number                      as games_lost,
    team.value:games_drawn::number                     as games_drawn,
    team.value:team_name::string                       as team_name
from pm_power as src,
    lateral flatten(input => src.c1:data:baseData:table_1) as team;
Any help would be really grateful
Thanks,
Harish
# The table portion of the JSON needs flattening and then a transpose; example below -
Sample table -
select * from test_json;
+--------------------------------+
| TAB_VAL |
|--------------------------------|
| { |
| "table_1": [ |
| { |
| "games_drawn": "2", |
| "games_lost": "1", |
| "games_played": "29", |
| "games_won": "26", |
| "goals_against": "35", |
| "goals_for": "75", |
| "points": "80", |
| "position": "1", |
| "team_name": "Liverpool" |
| }, |
| { |
| "games_drawn": "5", |
| "games_lost": "4", |
| "games_played": "29", |
| "games_won": "20", |
| "goals_against": "45", |
| "goals_for": "60", |
| "points": "65", |
| "position": "2", |
| "team_name": "Man. City" |
| } |
| ] |
| } |
+--------------------------------+
1 Row(s) produced. Time Elapsed: 0.285s
Perform transpose after flattening JSON
-- Transpose the per-team statistics in three steps:
--   1. flatten the table_1 array into one typed row per team,
--   2. unpivot the statistic columns into (stats, figures) rows,
--   3. pivot the team names into columns.
with flattened as (
    select
        f.value:"games_drawn"::number   as games_drawn,
        f.value:"games_lost"::number    as games_lost,
        f.value:"games_played"::number  as games_played,
        f.value:"games_won"::number     as games_won,
        f.value:"goals_against"::number as goals_against,
        f.value:"goals_for"::number     as goals_for,
        f.value:"points"::number        as points,
        f.value:"position"::number      as position,
        f.value:"team_name"::String     as team_name
    from
        TEST_JSON, table(flatten(input=>tab_val:table_1, mode=>'ARRAY')) as f
),
unpivoted as (
    select figures, stats, team_name
    from flattened
        unpivot (figures for stats in (games_drawn, games_lost, games_played, games_won, goals_against, goals_for, points, position))
)
select *
from unpivoted
    pivot (min(figures) for team_name in ('Liverpool', 'Man. City'));
+---------------+-------------+-------------+
| STATS | 'Liverpool' | 'Man. City' |
|---------------+-------------+-------------|
| GAMES_DRAWN | 2 | 5 |
| GAMES_LOST | 1 | 4 |
| GAMES_PLAYED | 29 | 29 |
| GAMES_WON | 26 | 20 |
| GOALS_AGAINST | 35 | 45 |
| GOALS_FOR | 75 | 60 |
| POINTS | 80 | 65 |
| POSITION | 1 | 2 |
+---------------+-------------+-------------+
8 Row(s) produced. Time Elapsed: 0.293s

_lodash group objects by itemColor, itemSize, shopId

I'm using _lodash library,
this is my data with objects:
[ {
"itemColor": "red",
"itemSize": "L",
"itemCount": 1,
"shopId": "shop 1",
"itemName": "product name 1",
},
{
"itemColor": "red",
"itemSize": "L",
"itemCount": 3,
"shopId": "shop 2",
"itemName": "product name 1",
},
{
"itemColor": "red",
"itemSize": "L",
"itemCount": 5,
"shopId": "shop 3",
"itemName": "product name 1",
},
{
"itemColor": "green",
"itemSize": "S",
"itemCount": 1,
"shopId": "shop 3",
"itemName": "product name 2",
}]
I need to group items by itemSize, itemColor and as result I need to have this table:
+----------------+-------+------+--------+--------+--------+
| itemName | color | size | shop 1 | shop 2 | shop 3 |
+================+=======+======+========+========+========+
| product name 1 | red | L | 1 | 3 | 5 |
+----------------+-------+------+--------+--------+--------+
| product name 2 | green | S | 0 | 0 | 1 |
+----------------+-------+------+--------+--------+--------+
If a shop has no matching item, its value should be set to 0.

Ability to get the "index" (or ordinal value) for each array entry in BigQuery?

In a data column in BigQuery, I have a JSON object with the structure:
{
"sections": [
{
"secName": "Flintstones",
"fields": [
{ "fldName": "Fred", "age": 55 },
{ "fldName": "Barney", "age": 44 }
]
},
{
"secName": "Jetsons",
"fields": [
{ "fldName": "George", "age": 33 },
{ "fldName": "Elroy", "age": 22 }
]
}
]}
I'm hoping to unnest() and json_extract() to get results that resemble:
id | section_num | section_name | field_num | field_name | field_age
----+--------------+--------------+-----------+------------+-----------
1 | 1 | Flintstones | 1 | Fred | 55
1 | 1 | Flintstones | 2 | Barney | 44
1 | 2 | Jetsons | 1 | George | 33
1 | 2 | Jetsons | 2 | Elroy | 22
So far, I have the query:
-- One output row per (section, field) pair found in the JSON document
-- stored in the data column.
SELECT
    tbl.id,
    JSON_EXTRACT_SCALAR(curSection, '$.secName') AS section_name,
    JSON_EXTRACT_SCALAR(curField, '$.fldName') AS field_name,
    JSON_EXTRACT_SCALAR(curField, '$.age') AS field_age
FROM `tick8s.test2` AS tbl
-- First unnest the sections array, then the fields array of each section.
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(tbl.data, '$.sections')) AS curSection
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(curSection, '$.fields')) AS curField
that yields:
id | section_name | field_name | field_age
----+--------------+------------+-----------
1 | Flintstones | Fred | 55
1 | Flintstones | Barney | 44
1 | Jetsons | George | 33
1 | Jetsons | Elroy | 22
QUESTION: I'm not sure how, if possible, to get the section_num and field_num ordinal positions from their array index values?
(If you are looking to duplicate my results, I have a table named test2 with 2 columns:
id - INTEGER, REQUIRED
data - STRING, NULLABLE
and I insert the data with:
-- Seed row for the example. The target columns are listed explicitly so the
-- statement does not depend on the table's column order.
-- The JSON document is assembled from string fragments with ||.
insert into tick8s.test2 (id, data) values (1,
'{"sections": [' ||
'{' ||
'"secName": "Flintstones",' ||
'"fields": [' ||
'{ "fldName": "Fred", "age": 55 },' ||
'{ "fldName": "Barney", "age": 44 }' ||
']' ||
'},' ||
'{' ||
'"secName": "Jetsons",' ||
'"fields": [' ||
'{ "fldName": "George", "age": 33 },' ||
'{ "fldName": "Elroy", "age": 22 }' ||
']' ||
'}]}'
);
)
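Do you just want WITH OFFSET? Note that the offsets it returns are zero-based, so add 1 to them if you need the 1-based positions shown in your expected output.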
-- WITH OFFSET exposes each element's zero-based position inside its array;
-- adding 1 turns it into the 1-based section_num / field_num requested in
-- the question. Columns follow the order of the expected output.
SELECT
    tbl.id,
    n_s + 1 AS section_num,
    json_extract_scalar(curSection, '$.secName') AS section_name,
    n_c + 1 AS field_num,
    json_extract_scalar(curField, '$.fldName') AS field_name,
    json_extract_scalar(curField, '$.age') AS field_age
FROM `tick8s.test2` AS tbl
LEFT JOIN UNNEST(json_extract_array(tbl.data, '$.sections')) AS curSection
    WITH OFFSET AS n_s
LEFT JOIN UNNEST(json_extract_array(curSection, '$.fields')) AS curField
    WITH OFFSET AS n_c
-- Row order is otherwise unspecified; sort by id and array positions.
ORDER BY tbl.id, n_s, n_c;

Nested JSON aggregation in Postgres

I have a need to run a query over a Postgres database and aggregate it and export it as a json object using native Postgres tooling.
I can't quite get the aggregation working correctly and I'm a bit stumped.
Below is an example of some of the data
| msgserial | object_type | payload_key | payload | user_id |
+-----------+---------------+-------------+-----------------------------------------------------------+---------+
| 1696962 | CampaignEmail | a8901b2c | {"id": "ff7221da", "brand": "MAGIC", "eventType": "SENT"} | 001 |
| 1696963 | OtherType | b8901b2c | {"id": "ff7221db", "brand": "MAGIC", "eventType": "SENT"} | 001 |
| 1696964 | OtherType | c8901b2c | {"id": "ff7221dc", "brand": "MAGIC", "eventType": "SENT"} | 002 |
| 1696965 | OtherType | d8901b2c | {"id": "ff7221dd", "brand": "MAGIC", "eventType": "SENT"} | 001 |
| 1696966 | CampaignEmail | e8901b2c | {"id": "ff7221de", "brand": "MAGIC", "eventType": "SENT"} | 001 |
| 1696967 | CampaignEmail | f8901b2c | {"id": "ff7221df", "brand": "MAGIC", "eventType": "SENT"} | 002 |
| 1696968 | SomethingElse | g8901b2c | {"id": "ff7221dg", "brand": "MAGIC", "eventType": "SENT"} | 001 |
+-----------+---------------+-------------+-----------------------------------------------------------+---------+
I need to output a JSON object like this grouped by user_id
{
"user_id": 001,
"brand": "MAGIC",
"campaignEmails": [
{"id": "ff7221da", "brand": "MAGIC", "eventType": "SENT"},
{"id": "ff7221de", "brand": "MAGIC", "eventType": "SENT"},
{"id": "ff7221de", "brand": "MAGIC", "eventType": "SENT"}
],
"OtherTypes": [
{"id": "ff7221db", "brand": "MAGIC", "eventType": "SENT"},
{"id": "ff7221dd", "brand": "MAGIC", "eventType": "SENT"}
],
"Somethingelses": [
{"id": "ff7221dg", "brand": "MAGIC", "eventType": "SENT"}
]
},
{
"user_id": 002,
"campaignEmails": [
],
"OtherTypes": [
],
"Somethingelses": [
]
}
Essentially, I need to group all the payloads into arrays by their type, grouped by the user_id.
I started with JSONB_BUILD_OBJECT getting one of the object_types grouped together into an array but then got stumped.
Am I trying to achieve the impossible in raw PSQL? I'm really stumped and I keep hitting errors like X needs to be included in the GROUP BY clause etc...
I can group one of the object_types into an array grouped by user_id but can't seem to do all 3
My other thought was to use 3 subqueries, but I'm not sure how to do that either.
You need two aggregations: the first one grouped by user_id and object_type, and the second one grouped by user_id only:
-- Two-level aggregation:
--   inner query: one row per (user_id, object_type) holding a JSON array of
--                that type's payloads;
--   outer query: one row per user_id, merging {"user_id": ...} with an object
--                that maps each object_type to its payload array.
select
    jsonb_build_object('user_id', user_id)
    || jsonb_object_agg(object_type, payloads) as result
from (
    select
        user_id,
        object_type,
        -- Without ORDER BY the element order of the array is unspecified.
        -- msgserial is the serial column shown in the question's sample data.
        jsonb_agg(payload order by msgserial) as payloads
    from my_table
    group by user_id, object_type
) as per_type
group by user_id
order by user_id;
Db<>Fiddle.