postgres group by common id in array on many rows - sql

I have a table like this:
{
"type": "FeatureCollection",
"name": "test",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "final_zone": "ECZ", "id": 20753, "ids": "{22210,10959,22209,22213}", "sub": "{ECZECSZ,NULL,ECZECSZ,ECZECSZ}" }, "geometry": null },
{ "type": "Feature", "properties": { "final_zone": "Protection", "id": 6516, "ids": "{24920,24943}", "sub": "{NULL,NULL}" }, "geometry": null },
{ "type": "Feature", "properties": { "final_zone": "Protection", "id": 6524, "ids": "{24912,24920,24943,24971,24944}", "sub": "{NULL,NULL,NULL,NULL,NULL}" }, "geometry": null },
{ "type": "Feature", "properties": { "final_zone": "Protection", "id": 6528, "ids": "{24943,24958,24944}", "sub": "{NULL,NULL,NULL}" }, "geometry": null },
{ "type": "Feature", "properties": { "final_zone": "Protection", "id": 6610, "ids": "{24943,24971}", "sub": "{NULL,NULL}" }, "geometry": null },
{ "type": "Feature", "properties": { "final_zone": "Protection", "id": 6781, "ids": "{24912,24906,24943}", "sub": "{NULL,NULL,NULL}" }, "geometry": null }
]
}
In this particular instance, 24943 is present in all five of the Protection rows. How do I collapse down a table like this and aggregate the arrays into one flat array?
Keep in mind there are thousands of other rows, so I don't want one massive GROUP BY. I want JUST the rows that share a common ID in the ids array to be collapsed down.
I can do this
with abid as (
    select regexp_replace(id, '[{}]', '', 'gi')::int id from (
        select unnest(ids) id from (
            select string_to_array(ids, ',') ids from conflict.test
        ) t
    ) t2
),
abid2 as (
    select ids::int[] id from conflict.test
)
select t2.*, t.* from abid t, abid2 t2 where t.id = any(t2.id)
Just to give a little more scope, the brown middle piece shown below has the id 24943.

It's not entirely clear what result you want. If you just want to unify the arrays, try this:
with tmp as (
    select distinct unnest(ids) as ids
    from your_table_name
)
select x.final_zone, x.id, array_agg(t.ids), x.sub, x.polys
from tmp t
cross join your_table_name x
group by x.final_zone, x.id, x.sub, x.polys;
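If the goal is narrower, collapsing only the rows whose ids arrays share a common element, a rough sketch is below. It assumes conflict.test with columns id int and ids int[] (cast ids::int[] first if it is stored as text, as in the question's own CTE); this is not the accepted approach, just one way to group by the shared element:
-- Collapse only rows whose ids arrays share an element (e.g. 24943),
-- merging their ids into one de-duplicated array per common element.
WITH exploded AS (
    SELECT t.id, t.final_zone, t.ids, u.elem
    FROM conflict.test t
    CROSS JOIN LATERAL unnest(t.ids) AS u(elem)
),
common AS (
    -- elements that occur in more than one row
    SELECT elem
    FROM exploded
    GROUP BY elem
    HAVING count(DISTINCT id) > 1
)
SELECT e.elem                    AS common_id,
       array_agg(DISTINCT e.id)  AS collapsed_row_ids,
       array_agg(DISTINCT v)     AS merged_ids
FROM exploded e
JOIN common c USING (elem)
CROSS JOIN LATERAL unnest(e.ids) AS v
GROUP BY e.elem;
For 24943 this yields one row whose collapsed_row_ids are the five Protection ids and whose merged_ids is the union of their arrays; other shared elements such as 24920 produce their own groups.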

Related

Deserialise multiple objects into a select statement

In a table, I store multiple JSON documents as strings, spread across several records.
declare @x nvarchar(max) = N'{
  "totalSize": 1000,
  "done": true,
  "records": [
    {
      "attributes": {
        "type": "Contract",
        "url": ""
      },
      "Name": "Harpy",
      "Job_Schedule_Date__c": null,
      "EndDate": "2021-03-24",
      "Account": {
        "attributes": {
          "type": "Account",
          "url": ""
        },
        "Name": "Madison"
      },
      "ContractNumber": "12345",
      "Related_Site__r": {
        "attributes": {
          "type": "Site__c",
          "url": ""
        },
        "Name": "Jackson"
      }
    },
    .
    .
    .
  ]
}'
select * from openJson(@x, '$.records')
I am trying to use OPENJSON to unpack the records.
I am able to unpack a single record, but it doesn't unpack them into columns, and I need to unpack multiple records and combine them.
Since each stored document only holds 1,000 records, I need to join them up.
What I want is output like the below, as a SELECT:
Name, Job_Schedule_Date__c, EndDate, AccountName, ContractNumber, RelatedSiteName
Harpy, null, 2021-03-24, Madison, 12345, Jackson
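A minimal OPENJSON sketch gets close to that output. It assumes the JSON documents are stored in a hypothetical table dbo.StoredPayloads(Payload nvarchar(max)); the column names and paths come from the sample document:
-- Shred each stored JSON document into columns with OPENJSON ... WITH.
-- Nested values (Account.Name, Related_Site__r.Name) are reached via JSON paths;
-- CROSS APPLY over the table combines the 1000-record batches into one result set.
SELECT r.Name,
       r.Job_Schedule_Date__c,
       r.EndDate,
       r.AccountName,
       r.ContractNumber,
       r.RelatedSiteName
FROM dbo.StoredPayloads AS p            -- hypothetical table of stored JSON strings
CROSS APPLY OPENJSON(p.Payload, '$.records')
WITH (
    Name                 nvarchar(100) '$.Name',
    Job_Schedule_Date__c date          '$.Job_Schedule_Date__c',
    EndDate              date          '$.EndDate',
    AccountName          nvarchar(100) '$.Account.Name',
    ContractNumber       nvarchar(50)  '$.ContractNumber',
    RelatedSiteName      nvarchar(100) '$.Related_Site__r.Name'
) AS r;
Because the CROSS APPLY runs over every stored row, the 1,000-record batches are already joined up in one result set.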

Is there a way to stop nested columns taking NULL values in BigQuery when the higher-level parent column is NULL?

I am trying to query a BigQuery table which has nested values. All the elements of the table are NULLABLE and the data contains NULL values. The issue is that when I run the query, the nested values all take either the STRING value or NULL. Instead, I would like each nested value to take the STRING value if there is one, and otherwise have the higher-level valuesFiled field be NULL.
The database has the following schema:
{
  "name": "id",
  "type": "STRING",
  "mode": "NULLABLE"
},
{
  "name": "fields",
  "type": "RECORD",
  "mode": "NULLABLE",
  "fields": [{
    "name": "valuesFiled",
    "type": "RECORD",
    "mode": "NULLABLE",
    "fields": [{
      "name": "value1",
      "type": "STRING",
      "mode": "NULLABLE"
    }, {
      "name": "value2",
      "type": "STRING",
      "mode": "NULLABLE"
    }]
  }]
}
This is the query I am running:
SELECT
  id,
  STRUCT(
    CASE fields.valuesFiled
      WHEN NULL THEN NULL
      ELSE STRUCT(
        fields.valuesFiled.value1,
        fields.valuesFiled.value2
      )
    END AS values
  ) AS fields
FROM tableName;
An example of the JSON output I get:
{
  "id": "1",
  "fields": {
    "values": {
      "value1": "stringValue1",
      "value2": "stringValue2"
    }
  }
}, {
  "id": "2",
  "fields": {
    "values": {
      "value1": null,
      "value2": null
    }
  }
}
An example of the JSON output I would like:
{
  "id": "1",
  "fields": {
    "values": {
      "value1": "stringValue1",
      "value2": "stringValue2"
    }
  }
}, {
  "id": "2",
  "fields": {
    "values": null
  }
}
Does anyone know if this is possible with BigQuery, and if so, how it can be achieved?
Try this query. (A simple CASE ... WHEN NULL never matches, because WHEN compares with = and a comparison with NULL never evaluates to TRUE; checking the leaf fields with IS NULL works instead.)
SELECT
  id,
  STRUCT(
    CASE
      WHEN fields.valuesFiled.value1 IS NULL AND fields.valuesFiled.value2 IS NULL THEN NULL
      ELSE STRUCT(
        fields.valuesFiled.value1,
        fields.valuesFiled.value2
      )
    END AS values
  ) AS fields
FROM tableName;
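An equivalent formulation, if you prefer IF over CASE, is sketched below (same assumptions: a tableName with a fields.valuesFiled record):
-- Null out the whole struct when both leaf values are NULL; otherwise rebuild it.
SELECT
  id,
  STRUCT(
    IF(fields.valuesFiled.value1 IS NULL AND fields.valuesFiled.value2 IS NULL,
       NULL,
       STRUCT(fields.valuesFiled.value1, fields.valuesFiled.value2)) AS values
  ) AS fields
FROM tableName;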

select node value from json column type

I have a table called raw_data with three columns: ID, timestamp, and payload. The payload column is a JSON type, with values such as:
{
  "data": {
    "author_id": "1461871206425108480",
    "created_at": "2022-08-17T23:19:14.000Z",
    "geo": {
      "coordinates": {
        "type": "Point",
        "coordinates": [
          -0.1094,
          51.5141
        ]
      },
      "place_id": "3eb2c704fe8a50cb"
    },
    "id": "1560043605762392066",
    "text": " ALWAYS # London, United Kingdom"
  },
  "matching_rules": [
    {
      "id": "1560042248007458817",
      "tag": "london-paris"
    }
  ]
}
From this I want to select rows where the coordinates are available, such as [-0.1094, 51.5141] in this case.
SELECT *
FROM raw_data, json_each(payload)
WHERE json_extract(json_each.value, '$.data.geo.') IS NOT NULL
LIMIT 20;
Nothing was returned.
EDIT
NOT ALL json objects have the coordinates node. For example this value:
{
  "data": {
    "author_id": "1556031969062010881",
    "created_at": "2022-08-18T01:42:21.000Z",
    "geo": {
      "place_id": "006c6743642cb09c"
    },
    "id": "1560079621017796609",
    "text": "Dear Desperate sister say husband no dey oo."
  },
  "matching_rules": [
    {
      "id": "1560077018183630848",
      "tag": "kaduna-kano-katsina-dutse-zaria"
    }
  ]
}
The correct path is '$.data.geo.coordinates.coordinates' and there is no need for json_each():
SELECT *
FROM raw_data
WHERE json_extract(payload, '$.data.geo.coordinates.coordinates') IS NOT NULL;
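Building on that, a small sketch (assuming SQLite's JSON1 functions and the ID/payload columns from the question) that also pulls the two coordinates out as columns:
-- Rows that have coordinates, with longitude/latitude extracted via array indexing.
SELECT ID,
       json_extract(payload, '$.data.geo.coordinates.coordinates[0]') AS lon,
       json_extract(payload, '$.data.geo.coordinates.coordinates[1]') AS lat
FROM raw_data
WHERE json_extract(payload, '$.data.geo.coordinates.coordinates') IS NOT NULL;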

Query Druid SQL inner join with a dataSource name that has a dash

How do I write an INNER JOIN query between two data sources when one of them has a dash in its name?
Executing the following query against Druid SQL results in a query error:
SELECT *
FROM first
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;
org.apache.druid.java.util.common.ISE: Cannot build plan for query
Is this the correct syntax when trying to reference a data source that has a dash in its name?
Schema
[
{
"dataSchema": {
"dataSource": "second-schema",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"device_id",
"device_name",
"x_1",
"x_2",
"x_3",
"vlan",
"s_x",
"d_x",
"d_p",
"msg_type"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
{
"type": "count",
"name": "event_count"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "flow-info",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "flow-info"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
},
{
"dataSchema": {
"dataSource": "first",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"category",
"device_id",
"device_name",
"severity",
"x_2",
"x_3",
"x_4",
"x_5",
"vlan",
"s_x",
"d_x",
"s_i",
"d_i",
"d_p",
"id"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "doubleSum", "name": "val_num", "fieldName": "val_num" },
{ "type": "doubleMin", "name": "val_num_min", "fieldName": "val_num" },
{ "type": "doubleMax", "name": "val_num_max", "fieldName": "val_num" },
{ "type": "doubleSum", "name": "size", "fieldName": "size" },
{ "type": "doubleMin", "name": "size_min", "fieldName": "size" },
{ "type": "doubleMax", "name": "size_max", "fieldName": "size" },
{ "type": "count", "name": "first_count" }
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "first",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "first"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
]
Based on your schema definitions there are a few observations I'll make.
When doing a join you usually have to list out columns explicitly (rather than using *), otherwise you get collisions from duplicate column names. In your join, for example, you have a device_id in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using delimited (quoted) identifiers, I don't mix styles: either quote every identifier or quote none.
So I think your query will work better in a form more like this:
SELECT
"first"."etid",
"first"."category",
"first"."device_id",
"first"."device_name",
"first"."severity",
"first"."x_2",
"first"."x_3",
"first"."x_4",
"first"."x_5",
"first"."vlan",
"first"."s_x",
"first"."d_x",
"first"."s_i",
"first"."d_i",
"first"."d_p",
"first"."id",
"second-schema"."etid" as "ss_etid",
"second-schema"."device_id" as "ss_device_id",
"second-schema"."device_name" as "ss_device_name",
"second-schema"."x_1" as "ss_x_1",
"second-schema"."x_2" as "ss_x_2",
"second-schema"."x_3" as "ss_x_3",
"second-schema"."vlan" as "ss_vlan",
"second-schema"."s_x" as "ss_s_x",
"second-schema"."d_x" as "ss_d_x",
"second-schema"."d_p" as "ss_d_p",
"second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";
Obviously, feel free to name columns as you see fit, or include and exclude columns as needed. SELECT * will only work when all column names across both tables are unique.
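On Druid versions that support SQL joins, table aliases can also keep the quoting manageable; a short sketch with the column list trimmed for brevity:
-- Alias the dashed datasource once, then reference the alias everywhere else.
SELECT
  f."device_id",
  f."device_name",
  s."msg_type" AS "ss_msg_type"
FROM "first" AS f
INNER JOIN "second-schema" AS s
  ON f."device_id" = s."device_id";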

BigQuery DML insert-select into repeated tables

I have a table structure (table name: Recalled_transaction) as follows:
[{
  "name": "STR_NBR",
  "type": "STRING",
  "mode": "NULLABLE"
},
{
  "name": "RGSTR_NBR",
  "type": "INTEGER",
  "mode": "NULLABLE"
},
{
  "name": "POS_TRANS_ID",
  "type": "INTEGER",
  "mode": "NULLABLE"
},
{
  "name": "SLS_DT",
  "type": "DATE",
  "mode": "NULLABLE"
},
{
  "name": "TRANS_ORIG_SRC",
  "type": "RECORD",
  "mode": "REPEATED",
  "fields": [
    {
      "name": "POS_APPL_TYP_CD",
      "type": "STRING",
      "mode": "NULLABLE"
    },
    {
      "name": "USER_ID",
      "type": "STRING",
      "mode": "NULLABLE"
    }
  ]
},
{
  "name": "RECALLED_TXN",
  "type": "RECORD",
  "mode": "REPEATED",
  "fields": [
    {
      "name": "POS_SEQ_NBR",
      "type": "STRING",
      "mode": "REPEATED"
    },
    {
      "name": "SUB_SYS_CD",
      "type": "STRING",
      "mode": "NULLABLE"
    }
  ]
}
]
I would like to insert from a set of regular tables into this structure using INSERT ... SELECT (DML in standard SQL). Has anyone done this before? Any help is appreciated.
Thanks
I created a table with the same schema and put together a sample query to insert into it. In your particular case, since you have two tables, you will probably need to JOIN them and then use GROUP BY.
INSERT mydataset.SampleDmlTable
  (STR_NBR, RGSTR_NBR, POS_TRANS_ID, SLS_DT, TRANS_ORIG_SRC, RECALLED_TXN)
WITH T AS (
  SELECT CAST(x AS STRING) AS STR_NBR,
         10 - x AS RGSTR_NBR,
         x AS POS_TRANS_ID,
         DATE_SUB(CURRENT_DATE(), INTERVAL x DAY) AS SLS_DT,
         CONCAT('foo_', CAST(x AS STRING)) AS POS_APPL_TYP_CD,
         CAST(x AS STRING) AS USER_ID,
         [CONCAT('bar_', CAST(x AS STRING)), 'baz'] AS POS_SEQ_NBR,
         CAST(10 - x AS STRING) AS SUB_SYS_CD
  FROM UNNEST([1, 1, 0, 3, 2, 2, 2]) AS x
)
SELECT
  STR_NBR,
  RGSTR_NBR,
  POS_TRANS_ID,
  SLS_DT,
  ARRAY_AGG(STRUCT(POS_APPL_TYP_CD, USER_ID)) AS TRANS_ORIG_SRC,
  ARRAY_AGG(STRUCT(POS_SEQ_NBR, SUB_SYS_CD)) AS RECALLED_TXN
FROM T
GROUP BY 1, 2, 3, 4;
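For the two-table case mentioned above, the shape would be roughly as follows. The dataset name and the source tables header_table and line_table are hypothetical, and POS_SEQ_NBR is assumed to already be an ARRAY&lt;STRING&gt; in the source, matching the REPEATED field:
-- JOIN the flat source tables, GROUP BY the scalar columns,
-- and ARRAY_AGG the per-row structs into the two REPEATED record fields.
INSERT mydataset.Recalled_transaction
  (STR_NBR, RGSTR_NBR, POS_TRANS_ID, SLS_DT, TRANS_ORIG_SRC, RECALLED_TXN)
SELECT
  h.STR_NBR,
  h.RGSTR_NBR,
  h.POS_TRANS_ID,
  h.SLS_DT,
  ARRAY_AGG(STRUCT(l.POS_APPL_TYP_CD, l.USER_ID)) AS TRANS_ORIG_SRC,
  ARRAY_AGG(STRUCT(l.POS_SEQ_NBR, l.SUB_SYS_CD)) AS RECALLED_TXN
FROM mydataset.header_table AS h
JOIN mydataset.line_table AS l
  ON l.POS_TRANS_ID = h.POS_TRANS_ID
GROUP BY 1, 2, 3, 4;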