BigQuery DML INSERT ... SELECT into repeated tables - google-bigquery

I have a table structure (table name: Recalled_transaction) as follows:
[
  {
    "name": "STR_NBR",
    "type": "STRING",
    "mode": "NULLABLE"
  },
  {
    "name": "RGSTR_NBR",
    "type": "INTEGER",
    "mode": "NULLABLE"
  },
  {
    "name": "POS_TRANS_ID",
    "type": "INTEGER",
    "mode": "NULLABLE"
  },
  {
    "name": "SLS_DT",
    "type": "DATE",
    "mode": "NULLABLE"
  },
  {
    "name": "TRANS_ORIG_SRC",
    "type": "RECORD",
    "mode": "REPEATED",
    "fields": [
      {
        "name": "POS_APPL_TYP_CD",
        "type": "STRING",
        "mode": "NULLABLE"
      },
      {
        "name": "USER_ID",
        "type": "STRING",
        "mode": "NULLABLE"
      }
    ]
  },
  {
    "name": "RECALLED_TXN",
    "type": "RECORD",
    "mode": "REPEATED",
    "fields": [
      {
        "name": "POS_SEQ_NBR",
        "type": "STRING",
        "mode": "REPEATED"
      },
      {
        "name": "SUB_SYS_CD",
        "type": "STRING",
        "mode": "NULLABLE"
      }
    ]
  }
]
I would like to insert into this structure from a set of regular (flat) tables using INSERT ... SELECT (DML in standard SQL). Has anyone done this before? Any help is appreciated.
Thanks

I created a table with the same schema and put together a sample query to insert into it. In your particular case, since you have two tables, you will probably need to JOIN them and then use GROUP BY (a join-based sketch follows the sample query below).
INSERT mydataset.SampleDmlTable
  (STR_NBR, RGSTR_NBR, POS_TRANS_ID, SLS_DT, TRANS_ORIG_SRC, RECALLED_TXN)
WITH T AS (
  SELECT
    CAST(x AS STRING) AS STR_NBR,
    10 - x AS RGSTR_NBR,
    x AS POS_TRANS_ID,
    DATE_SUB(CURRENT_DATE(), INTERVAL x DAY) AS SLS_DT,
    CONCAT('foo_', CAST(x AS STRING)) AS POS_APPL_TYP_CD,
    CAST(x AS STRING) AS USER_ID,
    [CONCAT('bar_', CAST(x AS STRING)), 'baz'] AS POS_SEQ_NBR,
    CAST(10 - x AS STRING) AS SUB_SYS_CD
  FROM UNNEST([1, 1, 0, 3, 2, 2, 2]) AS x
)
SELECT
  STR_NBR,
  RGSTR_NBR,
  POS_TRANS_ID,
  SLS_DT,
  ARRAY_AGG(STRUCT(POS_APPL_TYP_CD, USER_ID)) AS TRANS_ORIG_SRC,
  ARRAY_AGG(STRUCT(POS_SEQ_NBR, SUB_SYS_CD)) AS RECALLED_TXN
FROM T
GROUP BY 1, 2, 3, 4;
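If the flat sources are two tables, the same ARRAY_AGG pattern applies after the join. A minimal sketch, assuming hypothetical flat tables header and txn_detail keyed on POS_TRANS_ID, with txn_detail.POS_SEQ_NBR already an ARRAY<STRING> to match the target schema:
INSERT mydataset.SampleDmlTable
  (STR_NBR, RGSTR_NBR, POS_TRANS_ID, SLS_DT, TRANS_ORIG_SRC, RECALLED_TXN)
SELECT
  h.STR_NBR,
  h.RGSTR_NBR,
  h.POS_TRANS_ID,
  h.SLS_DT,
  -- one array element per detail row, grouped under the parent transaction
  ARRAY_AGG(STRUCT(d.POS_APPL_TYP_CD, d.USER_ID)) AS TRANS_ORIG_SRC,
  ARRAY_AGG(STRUCT(d.POS_SEQ_NBR, d.SUB_SYS_CD)) AS RECALLED_TXN
FROM mydataset.header AS h
JOIN mydataset.txn_detail AS d
  ON d.POS_TRANS_ID = h.POS_TRANS_ID
GROUP BY 1, 2, 3, 4;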

Related

Is there a way to stop nested columns taking NULL values in BigQuery when the higher-level parent column is NULL?

I am trying to query a BigQuery table which has nested values. All the elements of the table are NULLABLE and the data contains NULL values. The issue is that when I run the query, each nested leaf value comes back as either its STRING value or NULL. Instead, I would like each leaf to keep its STRING value when one exists, and otherwise have the higher-level valuesFiled take the value of NULL.
The database has the following schema:
{
  "name": "id",
  "type": "STRING",
  "mode": "NULLABLE"
},
{
  "name": "fields",
  "type": "RECORD",
  "mode": "NULLABLE",
  "fields": [
    {
      "name": "valuesFiled",
      "type": "RECORD",
      "mode": "NULLABLE",
      "fields": [
        {
          "name": "value1",
          "type": "STRING",
          "mode": "NULLABLE"
        },
        {
          "name": "value2",
          "type": "STRING",
          "mode": "NULLABLE"
        }
      ]
    }
  ]
}
This is the query I am running:
SELECT
  id,
  STRUCT(
    CASE fields.valuesFiled
      WHEN NULL THEN NULL
      ELSE STRUCT(
        fields.valuesFiled.value1,
        fields.valuesFiled.value2
      )
    END AS values
  ) AS fields
FROM tableName;
An example of the JSON output I get:
{
  "id": "1",
  "fields": {
    "values": {
      "value1": "stringValue1",
      "value2": "stringValue2"
    }
  }
}, {
  "id": "2",
  "fields": {
    "values": {
      "value1": null,
      "value2": null
    }
  }
}
An example of the JSON output I would like:
{
  "id": "1",
  "fields": {
    "values": {
      "value1": "stringValue1",
      "value2": "stringValue2"
    }
  }
}, {
  "id": "2",
  "fields": {
    "values": null
  }
}
Does anyone know if this is possible with BigQuery, and if so, how it can be achieved?
The original CASE never matches because CASE x WHEN NULL compares with =, and an equality comparison against NULL never evaluates to TRUE, so the ELSE branch always runs. Test the leaf fields with IS NULL instead:
SELECT
  id,
  STRUCT(
    CASE
      WHEN fields.valuesFiled.value1 IS NULL AND fields.valuesFiled.value2 IS NULL THEN NULL
      ELSE STRUCT(
        fields.valuesFiled.value1,
        fields.valuesFiled.value2
      )
    END AS values
  ) AS fields
FROM tableName;
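An equivalent, slightly shorter form (a sketch under the same schema assumptions) keeps the struct whole instead of rebuilding it:
SELECT
  id,
  STRUCT(
    IF(fields.valuesFiled.value1 IS NULL AND fields.valuesFiled.value2 IS NULL,
       NULL,  -- the untyped NULL coerces to the struct type
       fields.valuesFiled) AS values
  ) AS fields
FROM tableName;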

BigQuery select rows with two (or more / less) matches in a repeated field

I have a schema that looks like this:
[
  {
    "name": "name",
    "type": "STRING",
    "mode": "REQUIRED"
  },
  {
    "name": "frm",
    "type": "RECORD",
    "mode": "REPEATED",
    "fields": [
      {
        "name": "c",
        "type": "STRING",
        "mode": "REQUIRED"
      },
      {
        "name": "n",
        "type": "STRING",
        "mode": "REQUIRED"
      }
    ]
  },
  {
    "name": "",
    "type": "STRING",
    "mode": "NULLABLE"
  }
]
With a sample record that looks like this:
I am trying to write a query that selects this row when there is a row in frm that matches c = 'X' and another row that has c = 'Z'. Only when both conditions are true do I want to select the name of the parent row. I actually have no clue how I could achieve this. Any suggestions?
E.g. this works, but I am unnesting frm two times; there must be a more efficient way, I guess.
SELECT name FROM `t2`
WHERE 'X' IN UNNEST(frm.c) AND 'Z' IN UNNEST(frm.c)
Consider the approach below:
select name
from your_table t
where 2 = (
  select count(distinct c)
  from t.frm
  where c in ('X', 'Z')
)
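The correlated subquery unnests frm for the current row only and counts the distinct values of c that match, so a row qualifies only when both 'X' and 'Z' are present. The same pattern generalizes to any number of required values; as a sketch, only the list and the count change:
select name
from your_table t
where 3 = (
  -- require all three of 'X', 'Y', 'Z' to appear in frm.c
  select count(distinct c)
  from t.frm
  where c in ('X', 'Y', 'Z')
)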

Inserting data from one BigQuery table to another returns 0 rows on group by

I am trying to insert data from one BigQuery table into another by running the query shown below, but I get 0 rows in return. However, if I take out the Survey column, I get the correct number of rows.
Both nested fields have the same schema. I have checked and double-checked the column names too, but I can't seem to figure out what's wrong with the Survey field.
INSERT INTO destination_table
  (Title, Description, Address, Survey)
SELECT
  Title AS Title,
  Description AS Description,
  [STRUCT(
    ARRAY_AGG(STRUCT(Address_Instance.Field1, Address_Instance.Field2)) AS Address_Record
  )] AS Address,
  [STRUCT(
    ARRAY_AGG(STRUCT(Survey_Instance.Field1, Survey_Instance.Field2)) AS Survey_Record
  )] AS Survey
FROM
  source_table,
  UNNEST(Survey) AS Survey,
  UNNEST(Survey_Instance) AS Survey_Instance
GROUP BY
  Title,
  Description
Here's what the schema of my source table looks like:
[
  {
    "name": "Title",
    "type": "STRING"
  },
  {
    "name": "Description",
    "type": "STRING"
  },
  {
    "name": "Address",
    "type": "RECORD",
    "mode": "REPEATED",
    "fields": [
      {
        "name": "Address_Instance",
        "type": "RECORD",
        "mode": "REPEATED",
        "fields": [
          {
            "name": "Field1",
            "type": "STRING"
          },
          {
            "name": "Field2",
            "type": "STRING"
          }
        ]
      }
    ]
  },
  {
    "name": "Survey",
    "type": "RECORD",
    "mode": "REPEATED",
    "fields": [
      {
        "name": "Survey_Instance",
        "type": "RECORD",
        "mode": "REPEATED",
        "fields": [
          {
            "name": "Field1",
            "type": "STRING"
          },
          {
            "name": "Field2",
            "type": "STRING"
          }
        ]
      }
    ]
  }
]
While mapping to the destination table, I rename the nested repeated records, but that's not causing any problems. I am wondering if I am overlooking something important and need some suggestions and advice; basically an extra set of eyes to help me figure out what I am doing wrong.
Would appreciate some help. Thanks in advance.
Use explicit JOINs in general. A comma join against UNNEST is an implicit CROSS JOIN, so any source row whose Survey array is empty or NULL is dropped entirely, which can easily leave 0 rows. In this case, use LEFT JOIN:
FROM source_table st
LEFT JOIN UNNEST(st.Survey) Survey ON 1=1
LEFT JOIN UNNEST(Survey.Survey_Instance) Survey_Instance ON 1=1
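A minimal repro of the difference, using hypothetical inline data:
-- the comma (CROSS JOIN) form returns rows for 'a' only;
-- 'b' has an empty array and is dropped
WITH source_table AS (
  SELECT 'a' AS Title, [1, 2] AS Survey UNION ALL
  SELECT 'b', ARRAY<INT64>[]
)
SELECT Title, s
FROM source_table, UNNEST(Survey) AS s;
-- replacing the comma join with
--   FROM source_table LEFT JOIN UNNEST(Survey) AS s ON 1=1
-- keeps 'b', with s = NULL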

Correct way to create "record" field in Avro schema

I am trying to understand Avro schemas and am stuck on complex types (record). The problem is very simple: create a schema which contains one record field with two primitive fields (string and timestamp) nested inside the record. I see two options for the schema:
option 1
{
  "type": "record",
  "name": "cool_subject",
  "namespace": "com.example",
  "fields": [
    {
      "name": "field_1",
      "type": "record",
      "fields": [
        {"name": "operation", "type": "string"},
        {"name": "timestamp", "type": "long", "logicalType": "timestamp-millis"}
      ]
    }
  ]
}
option 2
{
  "type": "record",
  "name": "cool_subject",
  "namespace": "com.example",
  "fields": [
    {
      "name": "field_1",
      "type": {
        "type": "record",
        "name": "field_1_type",
        "fields": [
          {"name": "operation", "type": "string"},
          {"name": "timestamp", "type": {"type": "long", "logicalType": "timestamp-millis"}}
        ]
      }
    }
  ]
}
The difference is in the "type" attribute of field_1.
As far as I know, option 2 is the correct way. Am I right? Is option 1 valid?
The second one is correct. The first one is not valid.
A record schema is something that looks like this:
{
  "type": "record",
  "name": <Name of the record>,
  "fields": [...]
}
And for fields, it should be like this:
[
  {
    "name": <name of field>,
    "type": <type of field>
  },
  ...
]
So in the case of a field which contains a record, it should always look like this:
[
  {
    "name": <name of field>,
    "type": {
      "type": "record",
      "name": <Name of the record>,
      "fields": [...]
    }
  },
  ...
]
The format in the first example would make it unclear if the name "field_1" was the name of the field or the name of the record.

postgres group by common id in array on many rows

I have a table like this:
{
  "type": "FeatureCollection",
  "name": "test",
  "crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
  "features": [
    { "type": "Feature", "properties": { "final_zone": "ECZ", "id": 20753, "ids": "{22210,10959,22209,22213}", "sub": "{ECZECSZ,NULL,ECZECSZ,ECZECSZ}" }, "geometry": null },
    { "type": "Feature", "properties": { "final_zone": "Protection", "id": 6516, "ids": "{24920,24943}", "sub": "{NULL,NULL}" }, "geometry": null },
    { "type": "Feature", "properties": { "final_zone": "Protection", "id": 6524, "ids": "{24912,24920,24943,24971,24944}", "sub": "{NULL,NULL,NULL,NULL,NULL}" }, "geometry": null },
    { "type": "Feature", "properties": { "final_zone": "Protection", "id": 6528, "ids": "{24943,24958,24944}", "sub": "{NULL,NULL,NULL}" }, "geometry": null },
    { "type": "Feature", "properties": { "final_zone": "Protection", "id": 6610, "ids": "{24943,24971}", "sub": "{NULL,NULL}" }, "geometry": null },
    { "type": "Feature", "properties": { "final_zone": "Protection", "id": 6781, "ids": "{24912,24906,24943}", "sub": "{NULL,NULL,NULL}" }, "geometry": null }
  ]
}
In this particular instance, 24943 is present in all 5 of the "Protection" rows. How do I collapse a table like this down and aggregate the arrays into one flat array?
Keep in mind there are thousands of other rows, so I don't want one massive GROUP BY; I want JUST the rows that share a common id in the ids array to be collapsed.
I can do this:
with abid as (
  select regexp_replace(id, '[{}]', '', 'gi')::int id
  from (
    select unnest(ids) id
    from (
      select string_to_array(ids, ',') ids
      from conflict.test
    ) t
  ) t2
),
abid2 as (
  select ids::int[] id
  from conflict.test
)
select t2.*, t.*
from abid t, abid2 t2
where t.id = any(t2.id)
Just to give a little more scope, the brown middle piece shown below has the id 24943.
It's not entirely clear what result you want. If you just want to unify the arrays, try this:
with tmp as (
  select unnest(ids) ids
  from your_table_name
  group by unnest(ids)
)
select x.final_zone, x.id, array_agg(t.ids), x.sub, x.polys
from tmp t
cross join your_table_name x
group by x.final_zone, x.id, x.sub, x.polys;
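If the goal is instead to collapse only the rows whose ids arrays contain a specific value (24943 here) and merge their arrays, one possible sketch, assuming ids casts to int[] as the question's ::int[] suggests:
with casted as (
  select final_zone, id, ids::int[] as ids
  from conflict.test
)
select final_zone,
       array_agg(distinct id) as collapsed_row_ids,  -- which source rows were merged
       array_agg(distinct elem) as merged_ids        -- one flat, de-duplicated array
from casted
cross join lateral unnest(ids) as elem
where 24943 = any(ids)
group by final_zone;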