Postgres sum values from json only if pair of values exist - sql

My json field looks something like this
{
"Ui": [
{
"element": "TG1",
"mention": "in",
"time": 123
},
{
"element": "TG1",
"mention": "out",
"time": 125
},
{ "element": "TG2",
"mention": "in",
"time": 251
},
{
"element": "TG2",
"mention": "out",
"time": 259
},
{ "element": "TG2",
"mention": "in",
"time": 251
}
]
}
I am trying to get the sum of the time differences per element, which should look like this:
| element | Timespent |
| TG1 | 2 |
| TG2 | 8 |
The problem is that, ideally, for every "in" entry there should be a matching "out" entry, which is clearly not the case in the above example. I want to only calculate the difference for complete in/out pairs and ignore any "in" value that doesn't have a corresponding "out". How can I do that?
Below is what I am using to get the time difference
select element, sum(time) as time_spent
from my_table
cross join lateral (
select
value->>'element' as element,
case value->>'mention' when 'in' then -(value->>'time')::numeric else (value->>'time')::numeric end as time
from json_array_elements(json_column->'Ui')) as elements
group by 1
order by 1

I was not sure about your json_column attribute - you need to group by it in order to not mix values between rows, so I included it in the window aggregations in the CTE part. But you don't have it in your results, so I skipped it in the final query as well. In short - you can check if the order number is even and equal to the max order number and then just skip it:
with logic as (
select json_column
, e->>'element' element
, case when
mod(lag(i) over (partition by json_column::text ,e->>'element' order by i),2) = 0
and
max(i) over (partition by json_column::text ,e->>'element') = i
then true else false end "skip"
, case when e->>'mention' = 'in' then -(e->>'time')::int else (e->>'time')::int end times
from my_table, json_array_elements(json_column->'Ui') with ordinality o(e,i)
)
select element, sum (times)
from logic
where not skip
group by element
;
element | sum
---------+-----
TG1 | 2
TG2 | 8
(2 rows)

Related

is there a way to extract duplicated row value in sql as the key/grouping value?

I have following two tables
users
id | name
1 | john
2 | ada
events
id | content | userId
1 | 'applied' | 1
2 | 'interviewed| 1
What would be the query that returns data in the following shape:
[
{name:'john', events:[{id:1, content:'applied'},{id:2, content:'interviewed'}]}
]
I have tried to run following queries
attempt 1
select events.id, content, users.name
from events
left join users
on users.id=events.userId
where events.userId = ?
but it return duplicated value for the name as following
[
{
"id": 1,
"content": "ronaldo",
"name": "Norman Zboncak"
},
{
"id": 2,
"content": "messi",
"name": "Norman Zboncak"
},
{
"id": 3,
"content": "messi",
"name": "Norman Zboncak"
}
]
attempt 2
I tried to use group_concat, but apparently you cannot pass multiple arguments into it, so I couldn't get the result in the desired shape.
You must do a LEFT join of users to events and aggregate with SQLite's JSON Functions:
SELECT json_object(
'name', u.name,
'events', json_group_array(json_object('id', e.id, 'content', e.content))
) result
FROM users u LEFT JOIN events e
ON e.userId = u.id
WHERE u.id = 1 -- remove this line to get results for all users
GROUP BY u.id;
See the demo.

How can I modify all values that match a condition inside a json array?

I have a table which has a JSON column called people like this:
Id
people
1
[{ "id": 6 }, { "id": 5 }, { "id": 3 }]
2
[{ "id": 2 }, { "id": 3 }, { "id": 1 }]
...and I need to update the people column and put a 0 in the path $[*].id where id = 3, so after executing the query, the table should end like this:
Id
people
1
[{ "id": 6 }, { "id": 5 }, { "id": 0 }]
2
[{ "id": 2 }, { "id": 0 }, { "id": 1 }]
There may be more than one match per row.
Honestly, I haven't tried any query yet since I cannot figure out how to loop inside a field, but my idea was something like this:
UPDATE mytable
SET people = JSON_SET(people, '$[*].id', 0)
WHERE /* ...something should go here */
This is my version
SELECT VERSION()
+-----------------+
| version() |
+-----------------+
| 10.4.22-MariaDB |
+-----------------+
If the id values in people are unique, you can use a combination of JSON_SEARCH and JSON_REPLACE to change the values:
UPDATE mytable
SET people = JSON_REPLACE(people, JSON_UNQUOTE(JSON_SEARCH(people, 'one', 3)), 0)
WHERE JSON_SEARCH(people, 'one', 3) IS NOT NULL
Note that the WHERE clause is necessary to prevent the query replacing values with NULL when the value is not found due to JSON_SEARCH returning NULL (which then causes JSON_REPLACE to return NULL as well).
If the id values are not unique, you will have to rely on string replacement, preferably using REGEXP_REPLACE to deal with possible differences in spacing in the values (and also to avoid replacing the 3 in, for example, 23 or 34):
UPDATE mytable
SET people = REGEXP_REPLACE(people, '("id"\\s*:\\s*)2\\b', '\\14')
Demo on dbfiddle
As stated in the official documentation, MySQL stores JSON-format strings in a string column, for this reason you can either use the JSON_SET function or any string function.
For your specific task, applying the REPLACE string function may suit your case:
UPDATE
mytable
SET
people = REPLACE(people, CONCAT('"id": ', 3, ' '), CONCAT('"id": ',0, ' '))
WHERE
....;

SQL select from array in JSON

I have a table with a json field with the following json:
[
{
productId: '1',
other : [
otherId: '2'
]
},
{
productId: '3',
other : [
otherId: '4'
]
}
]
I am trying to select the productId and otherId for every array element like this:
select JSON_EXTRACT(items, $.items[].productId) from order;
But this is completely wrong since it takes only the first element in the array
Do I need to write a loop or something?
First of all, the data you show is not valid JSON. It has multiple mistakes that make it invalid.
Here's a demo using valid JSON:
mysql> create table orders ( items json );
mysql> insert into orders set items = '[ { "productId": "1", "other": { "otherId": "2" } }, { "productId": "3", "other" : { "otherId": "4" } } ]'
mysql> SELECT JSON_EXTRACT(items, '$[*].productId') AS productIds FROM orders;
+------------+
| productIds |
+------------+
| ["1", "3"] |
+------------+
If you want each productId on a row by itself as a scalar value instead of a JSON array, you'd have to use JSON_TABLE() in MySQL 8.0:
mysql> SELECT j.* FROM orders CROSS JOIN JSON_TABLE(items, '$[*]' COLUMNS(productId INT PATH '$.productId')) AS j;
+-----------+
| productId |
+-----------+
| 1 |
| 3 |
+-----------+
This is tested in MySQL 8.0.23.
You also tagged your question MariaDB. I don't use MariaDB, and MariaDB has its own incompatible implementation of JSON support, so I can't predict how it will work.

In BigQuery, how do I check if two ARRAY of STRUCTs are equal

I have a query that outputs two array of structs:
SELECT modelId, oldClassCounts, newClassCounts
FROM `xyz`
GROUP BY 1
How do I create another column that is TRUE if oldClassCounts = newClassCounts?
Here is a sample result in JSON:
[
{
"modelId": "FBF21609-65F8-4076-9B22-D6E277F1B36A",
"oldClassCounts": [
{
"id": "A041EBB1-E041-4944-B231-48BC4CCE025B",
"count": "33"
},
{
"id": "B8E4812B-A323-47DD-A6ED-9DF877F501CA",
"count": "82"
}
],
"newClassCounts": [
{
"id": "A041EBB1-E041-4944-B231-48BC4CCE025B",
"count": "33"
},
{
"id": "B8E4812B-A323-47DD-A6ED-9DF877F501CA",
"count": "82"
}
]
}
]
I want the equality column to be TRUE if oldClassCounts and newClassCounts are exactly the same like the output above.
Anything else should be false.
I would go about with this solution
#standardSQL
WITH xyz AS (
SELECT "FBF21609-65F8-4076-9B22-D6E277F1B36A" AS modelId,
[STRUCT("A041EBB1-E041-4944-B231-48BC4CCE025B" as id, "33" as count),
STRUCT("B8E4812B-A323-47DD-A6ED-9DF877F501CA" as id, "82" as count)] AS oldClassCounts,
[STRUCT("A041EBB1-E041-4944-B231-48BC4CCE025B" as id, "33" as count),
STRUCT("B8E4812B-A323-47DD-A6ED-9DF877F501CA" as id, "82" as count)] as newClassCounts),
o as (SELECT modelId, id, count, array_length(oldClassCounts) as len FROM xyz, UNNEST(oldClassCounts) as old_c),
n as (SELECT modelId, id, count, array_length(newClassCounts) as len FROM xyz, UNNEST(newClassCounts) as new_c),
uneq as (select * from o except distinct select * from n)
select xyz.*, IF(uneq.modelId is not null, false, true) as equal from xyz left join (select distinct modelId from uneq) uneq on xyz.modelId = uneq.modelId
It works regardless of the order or having duplicates within the arrays. The idea is that we treat each of the arrays as a separate temporary table removing all elements that exist in one but not the other (using except distinct) and then have an extra check for the length of the arrays in case there are duplicates e.g.
"FBF21609-65F8-4076-9B22-D6E277F1B36A" AS modelId,
[STRUCT("A041EBB1-E041-4944-B231-48BC4CCE025B" as id, "33" as count),
STRUCT("B8E4812B-A323-47DD-A6ED-9DF877F501CA" as id, "82" as count),
STRUCT("B8E4812B-A323-47DD-A6ED-9DF877F501CA" as id, "82" as count)]
I would consider comparing the result of TO_JSON_STRING function applied on both of these arrays.
In the query it would be done in the following way:
SELECT modelId,
oldClassCounts,
newClassCounts,
CASE WHEN TO_JSON_STRING(oldClassCounts) = TO_JSON_STRING(newClassCounts)
THEN true
ELSE false
END
FROM `xyz`;
I'm not sure about the GROUP BY 1 part, because none of the fields are grouped or aggregated.
It is not going to work, if the order of elements in the array is going to be different. This solution is not perfect, but worked for the data you provided.

Create a table with multiple nesting levels from flattened structure

A bigquery challenge :
input
I have a table with incoming product batches that go into the factory and multiple sensors along the way measure different defects of different parts of the individual products. We are reading out the data from the devices in a flat structure.
The data is written to a incoming table.
Batch_id|Sensor_id|Product_part_id|defect_id|Count_defects|Event_Date
1.......|.1.......|1..............|2........|.5...........|.2018-7-1
1.......|.2.......|1..............|2........|.6...........|.2018-7-1
1.......|.2.......|2..............|3........|.7...........|.2018-7-1
1.......|.3.......|2..............|3........|.8...........|.2018-7-1
1.......|.3.......|2..............|4........|.9...........|.2018-7-1
1.......|.3.......|3..............|5........|.10...........|.2018-7-1
We can do de-duplication on these tables, as the same sensor might spit out the same data multiple times (by mistake, or on purpose when the count_defects value is updated), based on the last [updated_time].
the problem: transform into multiple nested repeated structs
Now I'm trying to materialize the raw input into fact tables partitioned by Event_Date but for max performance and cheapest storage, I want to achieve the following structure :
Batch_id|Sensor_id|Product_part_id|defect_id|Count_defects|Event_Date
1.......|.1.......|1..............|2........|.5...........|.2018-7-1
........|.2.......|1..............|2........|.6...........|.2018-7-1
........|.........|2..............|3........|.7...........|.2018-7-1
........|.3.......|2..............|3........|.8...........|.2018-7-1
........|.........|...............|4........|.9...........|.2018-7-1
........|.........|3..............|5........|.10..........|.2018-7-1
I cannot do multiple nested ARRAY() calls; it is not allowed and would also perform badly, as it would take the same base table as input multiple times.
Looking for suggestions on how to tackle this.
Thanks!
I'm using sequential application of array_agg() + GROUP BY for that, starting with the innermost array. After the first iteration I put the query into a WITH clause and start over, creating the next array again using array_agg() + GROUP BY.
Performance-wise this approach has the same constraints all GROUP BY queries have - you should avoid skewed group sizes if you can - otherwise it will just take longer, because BigQuery has to re-plan resources in the background when it realizes a group takes too much memory. But you can optimize using the query execution plan.
For your example table my result query looks like this:
WITH t AS (
SELECT 1 as batch_id, 1 as sensor_id, 1 as product_part_id, 2 as defect_id, 5 as count_defects, '2018-7-1' as event_date
UNION ALL SELECT 1 as batch_id, 2 as sensor_id, 1 as product_part_id, 2 as defect_id, 6 as count_defects, '2018-7-1' as event_date
UNION ALL SELECT 1 as batch_id, 2 as sensor_id, 2 as product_part_id, 3 as defect_id, 7 as count_defects, '2018-7-1' as event_date
UNION ALL SELECT 1 as batch_id, 3 as sensor_id, 2 as product_part_id, 3 as defect_id, 8 as count_defects, '2018-7-1' as event_date
UNION ALL SELECT 1 as batch_id, 3 as sensor_id, 2 as product_part_id, 4 as defect_id, 9 as count_defects, '2018-7-1' as event_date
UNION ALL SELECT 1 as batch_id, 3 as sensor_id, 3 as product_part_id, 5 as defect_id, 10 as count_defects, '2018-7-1' as event_date
),
defect_nesting as (
SELECT
batch_id,
sensor_id,
product_part_id,
array_agg(STRUCT(defect_id, count_defects, event_date) ORDER BY defect_id) defectInfo
FROM t
GROUP BY 1, 2, 3
),
product_nesting as (
SELECT
batch_id,
sensor_id,
array_agg(STRUCT(product_part_id, defectInfo) ORDER BY product_part_id) productInfo
FROM defect_nesting
GROUP BY 1,2
)
SELECT
batch_id,
array_agg(STRUCT(sensor_id, productInfo) ORDER BY sensor_id) sensorInfo
FROM product_nesting
GROUP BY 1
The resulting JSON:
[
{
"batch_id": "1",
"sensorInfo": [
{
"sensor_id": "1",
"productInfo": [
{
"product_part_id": "1",
"defectInfo": [
{
"defect_id": "2",
"count_defects": "5",
"event_date": "2018-7-1"
}
]
}
]
},
{
"sensor_id": "2",
"productInfo": [
{
"product_part_id": "1",
"defectInfo": [
{
"defect_id": "2",
"count_defects": "6",
"event_date": "2018-7-1"
}
]
},
{
"product_part_id": "2",
"defectInfo": [
{
"defect_id": "3",
"count_defects": "7",
"event_date": "2018-7-1"
}
]
}
]
},
{
"sensor_id": "3",
"productInfo": [
{
"product_part_id": "2",
"defectInfo": [
{
"defect_id": "3",
"count_defects": "8",
"event_date": "2018-7-1"
},
{
"defect_id": "4",
"count_defects": "9",
"event_date": "2018-7-1"
}
]
},
{
"product_part_id": "3",
"defectInfo": [
{
"defect_id": "5",
"count_defects": "10",
"event_date": "2018-7-1"
}
]
}
]
}
]
}
]
Hope that helps!