Ability to get the "index" (or ordinal value) for each array entry in BigQuery? - sql

In a data column in BigQuery, I have a JSON object with the structure:
{
"sections": [
{
"secName": "Flintstones",
"fields": [
{ "fldName": "Fred", "age": 55 },
{ "fldName": "Barney", "age": 44 }
]
},
{
"secName": "Jetsons",
"fields": [
{ "fldName": "George", "age": 33 },
{ "fldName": "Elroy", "age": 22 }
]
}
]}
I'm hoping to unnest() and json_extract() to get results that resemble:
id | section_num | section_name | field_num | field_name | field_age
----+--------------+--------------+-----------+------------+-----------
1 | 1 | Flintstones | 1 | Fred | 55
1 | 1 | Flintstones | 2 | Barney | 44
1 | 2 | Jetsons | 1 | George | 33
1 | 2 | Jetsons | 2 | Elroy | 22
So far, I have the query:
SELECT id,
json_extract_scalar(curSection, '$.secName') as section_name,
json_extract_scalar(curField, '$.fldName') as field_name,
json_extract_scalar(curField, '$.age') as field_age
FROM `tick8s.test2` AS tbl
LEFT JOIN unnest(json_extract_array(tbl.data, '$.sections')) as curSection
LEFT JOIN unnest(json_extract_array(curSection, '$.fields')) as curField
that yields:
id | section_name | field_name | field_age
----+--------------+------------+-----------
1 | Flintstones | Fred | 55
1 | Flintstones | Barney | 44
1 | Jetsons | George | 33
1 | Jetsons | Elroy | 22
QUESTION: I'm not sure how, if possible, to get the section_num and field_num ordinal positions from their array index values?
(If you are looking to duplicate my results, I have a table named test2 with 2 columns:
id - INTEGER, REQUIRED
data - STRING, NULLABLE
and I insert the data with:
insert into tick8s.test2 values (1,
'{"sections": [' ||
'{' ||
'"secName": "Flintstones",' ||
'"fields": [' ||
'{ "fldName": "Fred", "age": 55 },' ||
'{ "fldName": "Barney", "age": 44 }' ||
']' ||
'},' ||
'{' ||
'"secName": "Jetsons",' ||
'"fields": [' ||
'{ "fldName": "George", "age": 33 },' ||
'{ "fldName": "Elroy", "age": 22 }' ||
']' ||
'}]}'
);
)

Do you just want with offset?
SELECT id,
json_extract_scalar(curSection, '$.secName') as section_name,
n_s,
json_extract_scalar(curField, '$.fldName') as field_name,
json_extract_scalar(curField, '$.age') as field_age,
n_c
FROM `tick8s.test2` tbl LEFT JOIN
unnest(json_extract_array(tbl.data, '$.sections')
) curSection WITH OFFSET n_s LEFT JOIN
unnest(json_extract_array(curSection, '$.fields')
) curField WITH OFFSET n_c;

Related

Swap values between columns based on third column

I have a table like this:
src_id | src_source | dst_id | dst_source | metadata
--------------------------------------------------------
123 | A | 345 | B | some_string
234 | B | 567 | A | some_other_string
498 | A | 432 | A | another_one # this line should be ignored
765 | B | 890 | B | another_one # this line should be ignored
What I would like is:
A_id | B_id | metadata
-----------------------
123 | 345 | some string
567 | 234 | some_other_string
Here's the data to replicate:
data = [
("123", "A", "345", "B", "some_string"),
("234", "B", "567", "A", "some_other_string"),
("498", "A", "432", "A", "another_one"),
("765", "B", "890", "B", "another_two"),
]
cols = ["src_id", "src_source", "dst_id", "dst_source", "metadata"]
df = spark.createDataFrame(data).toDF(*cols)
I am a bit confused as to how to do this - I got to here:
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.withColumn("A_id",
F.when(F.col("src_source") == "A", F.col("src_id")))
.withColumn("B_id",
F.when(F.col("src_source") == "B", F.col("src_id")))
)
I think i figured it out - I need to split the df and union again!
ab_df = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "A") & (F.col("dst_source") == "B"))
.select(F.col("src_id").alias("A_id"),
F.col("dst_id").alias("B_id"),
"metadata")
)
ba_df = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "B") & (F.col("dst_source") == "A"))
.select(F.col("src_id").alias("B_id"),
F.col("dst_id").alias("A_id"),
"metadata")
)
all = ab_df.unionByName(ba_df)
You can do it without union, just in one select, without the need to write the same filter twice.
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.select(
F.when(F.col("src_source") == "A", F.col("src_id")).otherwise(F.col("dst_id")).alias("A_id"),
F.when(F.col("src_source") == "A", F.col("dst_id")).otherwise(F.col("src_id")).alias("B_id"),
"metadata"
)
)
output.show()
# +----+----+-----------------+
# |A_id|B_id| metadata|
# +----+----+-----------------+
# | 123| 345| some_string|
# | 567| 234|some_other_string|
# +----+----+-----------------+

Flatten JSON Object in Snowflake

I have a question about using the flatten function in Snowflake. I'm having trouble with extracting data from following path data:performance: of the following JSON-object:
{
"data": {
"metadata": {
"id": "001",
"created_at": "2020-01-01"
},
"performance": {
"2020-01-01": {
"ad_performances": [{
"ad": "XoGKkgcy7V3BDm6m",
"ad_impressions": 1,
"clicks": 0,
"device": "-3",
"total_net_amount": 0
}, {
"ad": "XoGKkgmFlHa3V5xj",
"ad_impressions": 17,
"clicks": 0,
"device": "-4",
"total_net_amount": 0
}, {
"ad": "XoGKkgmFlHa3V5xj",
"ad_impressions": 5,
"clicks": 0,
"device": "-5",
"total_net_amount": 0
}, {
"ad": "XoGKkgcy7V3BDm6m",
"ad_impressions": 19,
"clicks": 0,
"device": "-2",
"total_net_amount": 0
}, {
"ad": "XoGKkgcy7V3BDm6m",
"ad_impressions": 5,
"clicks": 0,
"device": "-1",
"total_net_amount": 0
}]
}
}
}
Desired result is a table with the "date" (2020-01-01), "ad" and "impressions".
I tried to achieve the desired result with:
select
key::date as date
,f.value:performances:ad as performances_array
,f.value:performances:impressions as performances_array
from <table>, lateral flatten (input => CLMN:performances) f;
but I´m not able to extract data from the "performance-array". Can someone help me out?
Thank you!
Can you try this one?
select f.KEY date,
l.VALUE:"ad" as performances_array,
l.VALUE:"impressions" as performances_array
from mydata, lateral flatten (input => CLMN:data.performance ) f,
lateral flatten (input => f.VALUE ) s,
lateral flatten (input => s.VALUE ) l
;
+------------+--------------------+--------------------+
| DATE | PERFORMANCES_ARRAY | PERFORMANCES_ARRAY |
+------------+--------------------+--------------------+
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 1 |
| 2020-01-01 | "XoGKkgmFlHa3V5xj" | 17 |
| 2020-01-01 | "XoGKkgmFlHa3V5xj" | |
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 19 |
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 5 |
+------------+--------------------+--------------------+
Only 2 LATERAL FLATTENs are required to extract the rows
select
a.key::date as ad_date,
b.value:ad::varchar as ad,
b.value:ad_impressions::int as impressions
from j
, lateral flatten(input => v:data:performance) a
, lateral flatten(input => a.value:ad_performances) b;
AD_DATE
AD
IMPRESSIONS
2020-01-01
XoGKkgcy7V3BDm6m
1
2020-01-01
XoGKkgmFlHa3V5xj
17
2020-01-01
XoGKkgmFlHa3V5xj
5
2020-01-01
XoGKkgcy7V3BDm6m
19
2020-01-01
XoGKkgcy7V3BDm6m
5
If you want to aggregate the data by ad date and ad,
with r as
(
select
a.key::date as ad_date,
b.value:ad::varchar as ad,
b.value:ad_impressions::int as impressions
from j
, lateral flatten(input => v:data:performance) a
, lateral flatten(input => a.value:ad_performances) b
)
select ad_date, ad, sum(impressions) as impressions
from r
group by ad_date, ad;
AD_DATE
AD
IMPRESSIONS
2020-01-01
XoGKkgcy7V3BDm6m
25
2020-01-01
XoGKkgmFlHa3V5xj
22

Convert a json which is a list of dictionaries into column/row format in Postgresql

I´ve a json which is a list of dictionaries with the next syntax:
[
{
"Date_and_Time": "Dec 29, 2017 15:35:37",
"Componente": "Bar",
"IP_Origen": "175.11.13.6",
"IP_Destino": "81.18.119.864",
"Country": "Brazil",
"Age": "3"
},
{
"Date_and_Time": "Dec 31, 2017 17:35:37",
"Componente": "Foo",
"IP_Origen": "176.11.13.6",
"IP_Destino": "80.18.119.864",
"Country": "France",
'Id': '123456',
'Car': 'Ferrari'
},
{
"Date_and_Time": "Dec 31, 2017 17:35:37",
"Age": "1",
"Country": "France",
'Id': '123456',
'Car': 'Ferrari'
},
{
"Date_and_Time": "Mar 31, 2018 14:35:37",
"Componente": "Foo",
"Country": "Germany",
'Id': '2468',
'Genre': 'Male'
}
]
The json is really big and each dictionary have different amount of key/values fields. And what I want to do is to create a table in postgresSQL where the key represents a column and the value a row. In the example explained above I would like table like this:
Date_and_Time | Componente | IP_Origen | IP_Destino | Country| Id | Car | Age| Genre
Dec 29, 2017 15:35:37 | Bar | 175.11.13.6 | 81.18.119.864 | Brazil | - | - | 3 | -
Dec 31, 2017 17:35:37 | Foo | 176.11.13.6 | 80.18.119.864 | France |123456 |Ferrari | - | -
Dec 31, 2017 17:35:37 | - | - | - | France |123456 |Ferrari | 1 | -
Mar 31, 2018 14:35:37 | Foo | - | - | Germany| 2468 | - | - | Male
The only solution I can think is putting the values one by one but this is no efficient at all
You can use jsonb_to_recordset to create record set out of your json and then use insert into to insert the records.
insert into table
select * from jsonb_to_recordset('<your json>'::jsonb)
as rec(Date_and_Time datetime, Componente text, IP_Origen text) --Specify all columns inside the table
Sample DBFiddle

Query both sides of a friend relationship in SQL

I have my Postgres table set up as such:
CREATE TABLE "Friendships" (
id integer DEFAULT PRIMARY KEY,
"fromUserId" integer NOT NULL REFERENCES "Users"(id),
"toUserId" integer NOT NULL REFERENCES "Users"(id)
);
When a user fetches their friendships, I run: SELECT * FROM Friendships WHERE fromUserId=XXXX.
How do I modify the query so that additional data is added to the results (true/false) based on whether toUserId also added that user?
Example result:
[
{ id: 444, fromUserId: 1, toUserId: 22, addedBack: false },
{ id: 445, fromUserId: 1, toUserId: 67, addedBack: true },
{ id: 446, fromUserId: 1, toUserId: 599, addedBack: true },
{ id: 447, fromUserId: 1, toUserId: 733, addedBack: false },
]
With EXISTS:
select f.*,
exists (
select 0 from "Friendships"
where "toUserId" = f."fromUserId" and "fromUserId" = f."toUserId"
) addedBack
from "Friendships" f
where f."fromUserId" = 1
For this sample data:
INSERT INTO "Friendships"(id, "fromUserId", "toUserId") VALUES
(444, 1, 22), (445, 1, 67), (446, 1, 599), (447, 1, 733),
(448, 67, 1), (449, 599, 1);
Results:
> id | fromUserId | toUserId | addedback
> --: | ---------: | -------: | :--------
> 444 | 1 | 22 | f
> 445 | 1 | 67 | t
> 446 | 1 | 599 | t
> 447 | 1 | 733 | f
See the demo.
You can use a left outer join to check for the reciprocate value:
select
f.id,
f.fromuserid,
f.touserid,
case when r.id is null then false else true end as addedback
from friendships f
left join friendships r on f.touserid = r.fromuserid
and r.touserid = f.fromuserid
where f.fromuserid = XXXX

How to create nested JSON return with aggregate function and dynamic key values using `jsonb_build_object`

This is what and example of the table looks like.
+---------------------+------------------+------------------+
| country_code | region | num_launches |
+---------------------+------------------+------------------+
| 'CA' | 'Ontario' | 5 |
+---------------------+------------------+------------------+
| 'CA' | 'Quebec' | 9 |
+---------------------+------------------+------------------+
| 'DE' | 'Bavaria' | 15 |
+---------------------+------------------+------------------+
| 'DE' | 'Saarland' | 12 |
+---------------------+------------------+------------------+
| 'DE' | 'Berlin' | 23 |
+---------------------+------------------+------------------+
| 'JP' | 'Tokyo' | 19 |
+---------------------+------------------+------------------+
I am able to write a query that returns each country_code with all regions nested within, but I am unable to get exactly what I am looking for.
My intended return looks like.
[
{ 'CA': [
{ 'Ontario': 5 },
{ 'Quebec': 9 }
]
},
{ 'DE': [
{ 'Bavaria': 15 },
{ 'Saarland': 12 },
{ 'Berlin': 23 }
]
},
{ 'JP': [
{ 'Tokyo': 19 }
]
}
]
How could this be calculated if the num_launches was not available?
+---------------------+------------------+
| country_code | region |
+---------------------+------------------+
| 'CA' | 'Ontario' |
+---------------------+------------------+
| 'CA' | 'Ontario' |
+---------------------+------------------+
| 'CA' | 'Ontario' |
+---------------------+------------------+
| 'CA' | 'Quebec' |
+---------------------+------------------+
| 'CA' | 'Quebec' |
+---------------------+------------------+
| 'DE' | 'Bavaria' |
+---------------------+------------------+
| 'DE' | 'Bavaria' |
+---------------------+------------------+
| 'DE' | 'Bavaria' |
+---------------------+------------------+
| 'DE' | 'Bavaria' |
+---------------------+------------------+
| 'DE' | 'Saarland' |
+---------------------+------------------+
| 'DE' | 'Berlin' |
+---------------------+------------------+
| 'DE' | 'Berlin' |
+---------------------+------------------+
| 'JP' | 'Tokyo' |
+---------------------+------------------+
Expected Return
[
{ 'CA': [
{ 'Ontario': 3 },
{ 'Quebec': 2 }
]
},
{ 'DE': [
{ 'Bavaria': 4 },
{ 'Saarland': 1 },
{ 'Berlin': 2 }
]
},
{ 'JP': [
{ 'Tokyo': 1 }
]
}
]
Thanks
You can try to use json_agg with json_build_object function in a subquery to get the array then do it again in the main query.
Schema (PostgreSQL v9.6)
CREATE TABLE T(
country_code varchar(50),
region varchar(50),
num_launches int
);
insert into t values ('CA','Ontario',5);
insert into t values ('CA','Quebec',9);
insert into t values ('DE','Bavaria',15);
insert into t values ('DE','Saarland',12);
insert into t values ('DE','Berlin',23);
insert into t values ('JP','Tokyo',19);
Query #1
select json_agg(json_build_object(country_code,arr)) results
from (
SELECT country_code,
json_agg(json_build_object(region,num_launches)) arr
FROM T
group by country_code
) t1;
results
[{"CA":[{"Ontario":5},{"Quebec":9}]},{"DE":[{"Bavaria":15},{"Saarland":12},{"Berlin":23}]},{"JP":[{"Tokyo":19}]}]
View on DB Fiddle