How to delete/update nested data in bigquery - google-bigquery

Is there a way to delete/update nested field in bigquery?
Let's say I have this data
wives.age wives.name name
21 angel adam
20 kale
21 victoria rossi
20 jessica
or in json:
{"name":"adam","wives":[{"name":"angel","age":21},{"name":"kale","age":20}]}
{"name":"rossi","wives":[{"name":"victoria","age":21},{"name":"jessica","age":20}]}
As you can see from the data above.
Adam has 2 wives, named angel and kale. How to:
Delete kale record.
Update jessica to dessica
I searched for this but couldn't find an answer. I also tried UNNEST and similar approaches, but with no luck.
The reason why we want to do this is because we insert the array to the wrong records and want to remove/update array data with some condition.

Below is for BigQuery Standard SQL
#standardSQL
-- Rebuild each row's `wives` array: wives listed in `updates` are renamed,
-- wives listed in `divorces` are dropped, everything else is kept as is.
WITH updates AS (
  -- (husband name, wife's current name, wife's new name)
  SELECT 'rossi' AS name, 'jessica' AS oldname, 'dessica' AS newname UNION ALL
  SELECT 'rossi', 'victoria', 'polly' UNION ALL
  SELECT 'adam', 'angel', 'jen'
),
divorces AS (
  -- (husband name, name of the wife to remove)
  SELECT 'adam' AS name, 'kale' AS wifename UNION ALL
  SELECT 'adam', 'milly' UNION ALL
  SELECT 'rossi', 'linda'
),
updates_by_name AS (
  -- One row per husband carrying all of his renames as an array.
  SELECT name, ARRAY_AGG(STRUCT(oldname, newname)) AS updates
  FROM updates
  GROUP BY name
),
divorces_by_name AS (
  -- One row per husband carrying all of the wife names to remove.
  SELECT name, ARRAY_AGG(wifename) AS divorces
  FROM divorces
  GROUP BY name
)
SELECT
  t.name,
  ARRAY(
    SELECT AS STRUCT
      wife.age,
      CASE
        WHEN upd.oldname IS NULL THEN wife.name
        ELSE upd.newname
      END AS name
    FROM UNNEST(t.wives) AS wife
    LEFT JOIN UNNEST(u.updates) AS upd
      ON t.name = u.name AND wife.name = upd.oldname
    LEFT JOIN UNNEST(d.divorces) AS wifename
      ON t.name = d.name AND wife.name = wifename
    -- Anti-join: keep only wives with no matching divorce entry.
    WHERE wifename IS NULL
  ) AS waves
FROM `project.dataset.table` AS t
LEFT JOIN updates_by_name AS u ON t.name = u.name
LEFT JOIN divorces_by_name AS d ON t.name = d.name
You can test / play with above using dummy data as below
#standardSQL
-- Self-contained demo: the source table is faked by a CTE, then each row's
-- `wives` array is rebuilt with renames applied and divorced wives removed.
WITH `project.dataset.table` AS (
  SELECT 'adam' AS name, [STRUCT<age INT64, name STRING>(21, 'angel'), (20, 'kale'), (22, 'milly')] AS wives UNION ALL
  SELECT 'rossi', [STRUCT<age INT64, name STRING>(21, 'victoria'), (20, 'jessica'), (23, 'linda')]
),
updates AS (
  -- (husband name, wife's current name, wife's new name)
  SELECT 'rossi' AS name, 'jessica' AS oldname, 'dessica' AS newname UNION ALL
  SELECT 'rossi', 'victoria', 'polly' UNION ALL
  SELECT 'adam', 'angel', 'jen'
),
divorces AS (
  -- (husband name, name of the wife to remove)
  SELECT 'adam' AS name, 'kale' AS wifename UNION ALL
  SELECT 'adam', 'milly' UNION ALL
  SELECT 'rossi', 'linda'
),
updates_by_name AS (
  -- One row per husband carrying all of his renames as an array.
  SELECT name, ARRAY_AGG(STRUCT(oldname, newname)) AS updates
  FROM updates
  GROUP BY name
),
divorces_by_name AS (
  -- One row per husband carrying all of the wife names to remove.
  SELECT name, ARRAY_AGG(wifename) AS divorces
  FROM divorces
  GROUP BY name
)
SELECT
  t.name,
  ARRAY(
    SELECT AS STRUCT
      wife.age,
      CASE
        WHEN upd.oldname IS NULL THEN wife.name
        ELSE upd.newname
      END AS name
    FROM UNNEST(t.wives) AS wife
    LEFT JOIN UNNEST(u.updates) AS upd
      ON t.name = u.name AND wife.name = upd.oldname
    LEFT JOIN UNNEST(d.divorces) AS wifename
      ON t.name = d.name AND wife.name = wifename
    -- Anti-join: keep only wives with no matching divorce entry.
    WHERE wifename IS NULL
  ) AS waves
FROM `project.dataset.table` AS t
LEFT JOIN updates_by_name AS u ON t.name = u.name
LEFT JOIN divorces_by_name AS d ON t.name = d.name
result is as expected
name waves.age waves.name
adam 21 jen
rossi 21 polly
20 dessica
I hope you will be able to apply above to your real case :o)

Related

How to dedup array_agg in bigquery

I created a new table with repeating records with duplicates.
I am trying to find the most efficient way to deduplicate records as this will be run
on a table with millions of records.
If you nest multiple CTEs, does the data structure matter? Is the processing done in memory, or does it write to temp tables when there is a lot of data?
-- Build t1.cte4: one row per id, with that id's last names collected into a
-- repeated STRUCT<last_name> field. No DISTINCT is applied, so the repeated
-- names in the sample rows are preserved in the array.
CREATE OR REPLACE TABLE t1.cte4 AS
WITH raw_names AS (
  SELECT 1 AS id, 'eren' AS last_name UNION ALL
  SELECT 1, 'yilmaz' UNION ALL
  SELECT 1, 'kaya' UNION ALL
  SELECT 1, 'kaya' UNION ALL
  SELECT 2, 'smith' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'brown'
)
SELECT
  id,
  ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM raw_names
GROUP BY id;
I can remove duplicates as follows.
QUERY 1 How to dedup the concat_struct ?
-- Per id: the distinct last names joined with '~', next to the last names
-- re-collected as an array of structs (this second column is NOT deduplicated).
SELECT
  id,
  STRING_AGG(DISTINCT rec.last_name, '~') AS concat_string,
  ARRAY_AGG(STRUCT(rec.last_name)) AS concat_struct
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS rec
GROUP BY id;
QUERY 1
QUERY 2 Is there a better way than this to dedup?
-- Group by (id, last_name) to collapse repeated names, collect the surviving
-- names per id with a window aggregate, then DISTINCT away the repeated rows.
SELECT DISTINCT
  id,
  TO_JSON_STRING(ARRAY_AGG(rec.last_name) OVER (PARTITION BY id)) AS json_string
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS rec
GROUP BY
  id,
  rec.last_name;
QUERY 2
How do I get it out of the table as distinct values rather than using the CTE? This does not dedup.
-- Flatten last_name_rec and re-aggregate it per id; no DISTINCT is involved,
-- so repeated last names stay in the resulting array.
SELECT
  id,
  ARRAY_AGG(STRUCT(rec.last_name)) AS concat_struct
FROM t1.cte4
CROSS JOIN UNNEST(last_name_rec) AS rec
GROUP BY id;
I can't do this.
-- Attempt from the question that the asker reports as not possible:
-- DISTINCT is applied directly to a STRUCT argument inside ARRAY_AGG.
-- NOTE(review): presumably rejected because DISTINCT aggregation does not
-- accept STRUCT arguments in BigQuery; confirm against the ARRAY_AGG docs.
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;
UPDATE: Decompose the struct before deduplication and then compose it back:
-- Deduplicate the repeated last_name_rec field of t1.cte4.
-- The struct is flattened to its plain last_name value first, duplicates are
-- collapsed per (id, last_name), and only then is the STRUCT array rebuilt.
-- The table is referenced as t1.cte4 (dataset-qualified), matching the name it
-- was created under; a bare `cte4` only resolves when a default dataset is set.
SELECT
  id,
  ARRAY_AGG(STRUCT(last_name)) AS concat_struct
FROM (
  SELECT
    id,
    ln.last_name
  FROM t1.cte4, UNNEST(last_name_rec) AS ln
  GROUP BY id, ln.last_name
) AS d
GROUP BY id;
(original answer based on unwanted change of table definition follows)
Just use array_agg(distinct ...):
-- Variant that stores plain strings instead of structs: with a scalar argument
-- ARRAY_AGG(DISTINCT ...) removes the repeated last names directly.
WITH raw_names AS (
  SELECT 1 AS id, 'eren' AS last_name UNION ALL
  SELECT 1, 'yilmaz' UNION ALL
  SELECT 1, 'kaya' UNION ALL
  SELECT 1, 'kaya' UNION ALL
  SELECT 2, 'smith' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'jones' UNION ALL
  SELECT 2, 'brown'
)
SELECT
  id,
  ARRAY_AGG(DISTINCT last_name) AS last_name_rec
FROM raw_names
GROUP BY id;

Query for Multiple row SQL Where clause

I need help with a query that derives a result from multiple rows based on a condition. For example, we have a table with [Roll no] and [subjects], and the table can have multiple records for the same [Roll No]. My requirement: if the student opted only for 'English', the result should be 'E'; if only for Maths, 'M'; and if for both, 'B'.
// I think this is what you want.
-- Sample rows: Jones takes only English, Smith only Math, Adams both.
INSERT INTO dbo.rolls (name, subject)
VALUES
    ('Jones', 'English'),
    ('Smith', 'Math'),
    ('Adams', 'English'),
    ('Adams', 'Math')
GO
-- Classify every student who takes English and/or Math:
--   'B' when the student has both subjects,
--   otherwise the first letter of the single subject ('E' or 'M').
-- Output columns: name, code.
--
-- Fixes over the two-branch UNION version:
--   * subjects other than English/Math can no longer leak into the result
--     (the old join back to rolls was not filtered by subject);
--   * COUNT(DISTINCT subject) keeps a duplicated row for one subject from
--     being reported as 'B'.
;WITH subject_counts AS (
    SELECT
        name,
        COUNT(DISTINCT subject) AS subject_cnt,
        -- With a single distinct subject, MIN() simply returns that subject.
        MIN(subject) AS only_subject
    FROM dbo.rolls
    WHERE subject IN ('English', 'Math')
    GROUP BY name
)
SELECT
    name,
    CASE
        WHEN subject_cnt > 1 THEN 'B'
        ELSE SUBSTRING(only_subject, 1, 1)
    END AS code
FROM subject_counts

Union null in Oracle

I'm trying to combine 2 queries in Oracle; the rows they return have the same values except for one field.
Ex:
-- Rows for JOAO with FLAG '0', unioned (duplicates removed) with his FLAG '1' rows.
SELECT NAME, AGE, EMAIL, DATE
FROM table_a
WHERE NAME = 'JOAO'
  AND FLAG = '0'
UNION
SELECT NAME, AGE, EMAIL, DATE
FROM table_a
WHERE NAME = 'JOAO'
  AND FLAG = '1'
Result:
NAME AGE EMAIL DATE
JOAO 23 a#a.com 20150414
JOAO 23 a#a.com null
How can I group these rows? I'm looking for something that gives me a result like this:
NAME AGE EMAIL DATE
JOAO 23 a#a.com 20150414
Thank you
(sorry for my english..)
You can use COALESCE(). http://docs.oracle.com/cd/B28359_01/server.111/b28286/functions023.htm#SQLRF00617
This Query should work for every name, and should coalesce the other rows.
-- For every NAME that has both a FLAG = '0' row and a FLAG = '1' row, return
-- one combined row: each column comes from the FLAG = '0' row and falls back
-- to the FLAG = '1' row when it is NULL.
--
-- Oracle notes:
--   * table aliases are written without AS (Oracle rejects "table_a AS t1");
--   * FLAG is compared to the string literals '0' / '1', as in the question,
--     instead of relying on implicit string-to-number conversion.
-- NOTE(review): DATE is a reserved word in Oracle; it stands in for the real
-- column name here and must be quoted or renamed in an actual schema.
SELECT
    t1.NAME                      AS NAME,
    COALESCE(t1.AGE, t2.AGE)     AS AGE,
    COALESCE(t1.EMAIL, t2.EMAIL) AS EMAIL,
    COALESCE(t1.DATE, t2.DATE)   AS DATE
FROM table_a t1
INNER JOIN table_a t2
    ON t2.NAME = t1.NAME
WHERE t1.FLAG = '0'
  AND t2.FLAG = '1';
You could change the conditions in your WHERE clause from
WHERE NAME = 'JOAO' AND FLAG = '0'
to
WHERE NAME = 'JOAO' AND FLAG IN('0','1')
so that your selection returns all the records with either '0' or '1' in the FLAG column.
If you're just trying to ignore the NULL values:
-- JOAO's rows for either flag value, skipping rows whose DATE is NULL.
SELECT
    NAME,
    AGE,
    EMAIL,
    DATE
FROM table_a
WHERE NAME = 'JOAO'
  AND FLAG IN ('0', '1')
  AND DATE IS NOT NULL
/
or
if you want to keep the nulls, but defer to available non-null values:
-- Keep one row per NAME: the one with the latest DATE. NULL dates sort last,
-- so a NULL-dated row is returned only when no dated row exists.
WITH ranked AS (
    SELECT
        NAME,
        AGE,
        EMAIL,
        DATE,
        ROW_NUMBER() OVER (
            PARTITION BY NAME
            ORDER BY DATE DESC NULLS LAST
        ) AS rn
    FROM table_a
    WHERE NAME = 'JOAO'
      AND FLAG IN ('0', '1')
)
SELECT
    NAME,
    AGE,
    EMAIL,
    DATE
FROM ranked
WHERE rn = 1
/
[edit]
in response to comment:
If you want to keep union, that's fine, union and OR are mostly interchangeable (in this case):
-- UNION form of the same filter: each branch handles one FLAG value and
-- discards rows whose DATE is NULL.
SELECT NAME, AGE, EMAIL, DATE
FROM table_a
WHERE NAME = 'JOAO' AND FLAG = '0' AND DATE IS NOT NULL
UNION
SELECT NAME, AGE, EMAIL, DATE
FROM table_a
WHERE NAME = 'JOAO' AND FLAG = '1' AND DATE IS NOT NULL
/
or even:
-- Same UNION, with the NULL-date filter applied once on the combined result.
SELECT *
FROM (
    SELECT NAME, AGE, EMAIL, DATE
    FROM table_a
    WHERE NAME = 'JOAO' AND FLAG = '0'
    UNION
    SELECT NAME, AGE, EMAIL, DATE
    FROM table_a
    WHERE NAME = 'JOAO' AND FLAG = '1'
)
WHERE DATE IS NOT NULL
/
if you just want the NULL criteria in 1 place ;)
Same logic can apply to the 2nd query I wrote above using row_number() analytics ...
-- UNION the two FLAG branches, rank each NAME's rows by DATE (latest first,
-- NULLs last) and keep only the top-ranked row per NAME.
WITH flagged_rows AS (
    SELECT NAME, AGE, EMAIL, DATE
    FROM table_a
    WHERE NAME = 'JOAO' AND FLAG = '0'
    UNION
    SELECT NAME, AGE, EMAIL, DATE
    FROM table_a
    WHERE NAME = 'JOAO' AND FLAG = '1'
),
ranked AS (
    SELECT
        NAME,
        AGE,
        EMAIL,
        DATE,
        ROW_NUMBER() OVER (
            PARTITION BY NAME
            ORDER BY DATE DESC NULLS LAST
        ) AS rn
    FROM flagged_rows
)
SELECT
    NAME,
    AGE,
    EMAIL,
    DATE
FROM ranked
WHERE rn = 1
/

Select literals as table data in SQL server

Consider table Address , with fields Country, State, and other data fields. I want to get all the records except for those with Country,State combination as (US, IL), (US,LA), (IND,DEL)
The query goes like
-- Question's attempt at returning every Address row except those whose
-- (Country, State) pair is (US, IL), (US, LA) or (IND, DEL).
-- NOTE(review): as written there is no FROM between the inner
-- "select Country,State" and the parenthesised derived table, and the
-- correlation compares both columns with != joined by AND rather than
-- matching the pair with =.
Select * from Address a
where not exists
(
select Country,State
-- Literal list of the pairs to exclude, built as a UNION of single-row selects.
(select 'US' as Country, 'IL' as State
union
select 'US' as Country, 'LA' as State
union
select 'IND' as Country, 'DEL' as State
) e
where e.Country != a.Country and e.State != a.state
)
How can this be achieved more easily (replacing the union of country/state combinations with a simple subquery)? As the total data is not very large, I am not too concerned about performance for now.
I know i can create table variable, add all literal combination there using insert into syntax, and use table variable for not exists, but i feel it is overkill for small requirement (not exists on 2 variables).
Looks like your query tried to do this:
-- Return every Address row whose (Country, State) pair is not in the literal
-- exclusion list; the list is supplied through a VALUES table constructor.
SELECT *
FROM Address AS a
WHERE NOT EXISTS (
    SELECT *
    FROM (
        VALUES ('US', 'IL'), ('US', 'LA'), ('IND', 'DEL')
    ) AS e (Country, State)
    WHERE e.Country = a.Country
      AND e.State = a.State
);
Or you could not use a derived table and still get the same result
-- Same exclusion without a derived table: reject rows matching any of the
-- three (Country, State) pairs. The two US pairs are folded into one IN list.
SELECT *
FROM Address AS a
WHERE NOT (
       (a.Country = 'US'  AND a.State IN ('IL', 'LA'))
    OR (a.Country = 'IND' AND a.State = 'DEL')
);
Simply use the values directly in the query:
-- Sample data.
-- Sample data.
-- A table variable must be named with a leading @; "declare #Table" is not
-- valid T-SQL (# is the prefix for temp tables created with CREATE TABLE).
DECLARE @Table AS TABLE (Country VARCHAR(6), State VARCHAR(6), Foo VARCHAR(6));

INSERT INTO @Table (Country, State, Foo)
VALUES
    ('US',  'IL',  'one'),
    ('XX',  'LA',  'two'),
    ('IND', 'XXX', 'three'),
    ('IND', 'DEL', 'four');

SELECT * FROM @Table;

-- Demonstrate excluding specific combinations: left join against the literal
-- list of pairs and keep only the rows that found no match.
SELECT T.*
FROM @Table AS T
LEFT OUTER JOIN (
    VALUES ('US', 'IL'), ('US', 'LA'), ('IND', 'DEL')
) AS Exclude (Country, State)
    ON T.Country = Exclude.Country
   AND T.State = Exclude.State
WHERE Exclude.Country IS NULL;
or
-- Anti-join: left join Address to the literal list of excluded pairs and keep
-- only the rows for which no pair matched (n.Country is NULL).
SELECT *
FROM Address AS a
LEFT JOIN (
    SELECT 'US' AS Country, 'IL' AS State
    UNION
    SELECT 'US', 'LA'
    UNION
    SELECT 'IND', 'DEL'
) AS n
    ON n.Country = a.Country
   AND n.State = a.State
WHERE n.Country IS NULL;

How do I combine data from multiple rows into one?

I’m using SQL Server 2008. I have data as in this table:
Team Email Groups
------- ------------------ ------
|Team1|-|email0#email.com|-|A|
|Team1|-|email1#email.com|-|B|
|Team1|-|email2#email.com|-|C|
|Team2|-|email3#email.com|-|A|
|Team2|-|email4#email.com|-|B|
|Team2|-|email5#email.com|-|C|
I want to get the data in this format:
Team A B C
------- ------------------ ------------------ ------------------
|Team1|-|email0#email.com|-|email1#email.com|-|email2#email.com|
|Team2|-|email3#email.com|-|email4#email.com|-|email5#email.com|
How can I achieve this?
Using PIVOT You can do the following
-- Turn one row per (Team, Groups) into one row per Team with a column for each
-- of the groups A, B and C. MAX() is only there because PIVOT requires an aggregate.
WITH SampleData AS
(
    SELECT v.Team, v.email, v.Groups
    FROM (
        VALUES
            ('Team1', 'email0#email.com', 'A'),
            ('Team1', 'email1#email.com', 'B'),
            ('Team1', 'email2#email.com', 'C'),
            ('Team2', 'email3#email.com', 'A'),
            ('Team2', 'email4#email.com', 'B'),
            ('Team2', 'email5#email.com', 'C')
    ) AS v (Team, email, Groups)
)
SELECT
    pvt.Team,
    pvt.A,
    pvt.B,
    pvt.C
FROM (
    SELECT Team, email, Groups
    FROM SampleData
) AS src
PIVOT (
    MAX(email) FOR Groups IN ([A], [B], [C])
) AS pvt
Produces
Team A B C
----- ---------------- ---------------- ----------------
Team1 email0#email.com email1#email.com email2#email.com
Team2 email3#email.com email4#email.com email5#email.com
See a working Data.SE example
In a DB that doesn't support PIVOT you can instead do multiple joins to your table. You may want to do that anyway since, as GBN pointed out, we're not really using an aggregate.
-- PIVOT-free variant: start from the distinct teams and left join the sample
-- rows once per group, so each group's email lands in its own column.
-- The three email columns are aliased A, B and C; without aliases all three
-- output columns would be named "email".
With SampleData AS
(
SELECT 'Team1' as Team , 'email0#email.com' as email, 'A' as Groups
UNION SELECT 'Team1' as Team , 'email1#email.com' as email, 'B' as Groups
UNION SELECT 'Team1' as Team , 'email2#email.com' as email, 'C' as Groups
UNION SELECT 'Team2' as Team , 'email3#email.com' as email, 'A' as Groups
UNION SELECT 'Team2' as Team , 'email4#email.com' as email, 'B' as Groups
UNION SELECT 'Team2' as Team , 'email5#email.com' as email, 'C' as Groups
)
SELECT
    source.Team,
    A.email AS A,
    B.email AS B,
    C.email AS C
FROM
    (SELECT DISTINCT Team FROM SampleData) source
-- The group filter sits in ON (not WHERE) so a team that lacks a group still
-- appears, with NULL in that group's column.
LEFT JOIN SampleData A
    ON source.Team = A.Team
    AND A.Groups = 'A'
LEFT JOIN SampleData B
    ON source.Team = B.Team
    AND B.Groups = 'B'
LEFT JOIN SampleData C
    ON source.Team = C.Team
    AND C.Groups = 'C'
See a working Data.SE example