How to dedup array_agg in bigquery - sql

I created a new table with a repeated record field that contains duplicates.
I am trying to find the most efficient way to deduplicate the records, as this will be run
on a table with millions of records.
If you use multiple nested CTEs, does your data structure matter? Is the processing done in memory, or does it write to temp tables when there is a lot of data?
-- Sample table: one row per id, with every last name (duplicates included)
-- collected into the repeated STRUCT column last_name_rec.
CREATE OR REPLACE TABLE t1.cte4 AS
WITH sample_names AS (
    SELECT 1 AS id, 'eren' AS last_name
    UNION ALL SELECT 1, 'yilmaz'
    UNION ALL SELECT 1, 'kaya'
    UNION ALL SELECT 1, 'kaya'
    UNION ALL SELECT 2, 'smith'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'brown'
)
SELECT
    id,
    ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM sample_names
GROUP BY id;
I can remove duplicates as follows.
QUERY 1 How to dedup the concat_struct ?
-- Flatten the repeated field and rebuild it per id. The concatenated string
-- is de-duplicated via DISTINCT; the struct array still receives every row.
SELECT
    id,
    STRING_AGG(DISTINCT ln.last_name, '~') AS concat_string,
    ARRAY_AGG(STRUCT(ln.last_name)) AS concat_struct
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY id;
QUERY 1
QUERY 2 Is there a better way than this to dedup?
-- GROUP BY (id, last_name) leaves one row per distinct name; the window
-- ARRAY_AGG then gathers those names per id, and DISTINCT collapses the
-- identical per-id rows into one.
SELECT DISTINCT
    id,
    TO_JSON_STRING(ARRAY_AGG(ln.last_name) OVER (PARTITION BY id)) AS json_string
FROM `t1.cte4`
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY
    id,
    ln.last_name;
QUERY 2
How do I get it out of the table as distinct, rather than using the CTE? This does not dedup.
-- Rebuilds the struct array per id straight from the stored table;
-- every unnested row is aggregated, so duplicates are kept.
SELECT
    id,
    ARRAY_AGG(STRUCT(ln.last_name)) AS concat_struct
FROM t1.cte4
CROSS JOIN UNNEST(last_name_rec) AS ln
GROUP BY id;
I can't do this.
-- Attempt to apply DISTINCT directly to the STRUCT argument of ARRAY_AGG.
-- The author reports that this statement cannot be run.
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;

UPDATE: Decompose the struct before deduplication and then compose it back:
-- De-duplicate on the scalar field first (the question reports that
-- ARRAY_AGG(DISTINCT STRUCT(...)) is rejected), then wrap each surviving
-- last_name back into a STRUCT and aggregate per id.
SELECT
    id,
    ARRAY_AGG(STRUCT(last_name)) AS concat_struct
FROM (
    SELECT
        id,
        ln.last_name
    FROM t1.cte4  -- dataset-qualified, matching the table name used in the question
    CROSS JOIN UNNEST(last_name_rec) AS ln
    GROUP BY id, ln.last_name
) AS d
GROUP BY id;
(original answer based on unwanted change of table definition follows)
Just use array_agg(distinct ...):
-- With a plain scalar column, DISTINCT inside ARRAY_AGG removes the
-- duplicate names for each id.
WITH sample_names AS (
    SELECT 1 AS id, 'eren' AS last_name
    UNION ALL SELECT 1, 'yilmaz'
    UNION ALL SELECT 1, 'kaya'
    UNION ALL SELECT 1, 'kaya'
    UNION ALL SELECT 2, 'smith'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'jones'
    UNION ALL SELECT 2, 'brown'
)
SELECT
    id,
    ARRAY_AGG(DISTINCT last_name) AS last_name_rec
FROM sample_names
GROUP BY id;

Related

bigquery transpose and concatenate for each record

I want to achieve the following transformation.
I have last_name stored in a repeated record as follows.
data before transformation
I want to achieve the following.
data after transformation
Example with sample data created.
-- Sample table: a single id whose last names are stored as a repeated
-- STRUCT column (last_name_rec).
CREATE OR REPLACE TABLE t1.cte1 AS
WITH sample_names AS (
    SELECT 1 AS id, 'eren' AS last_name
    UNION ALL SELECT 1, 'yilmaz'
    UNION ALL SELECT 1, 'kaya'
)
SELECT
    id,
    ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM sample_names
GROUP BY id;
-- For every (id, last_name) pair, also return the de-duplicated,
-- space-separated list of all last names belonging to that id.
WITH names_per_id AS (
    -- One row per id: all last names joined with a space.
    SELECT
        id,
        STRING_AGG(h.last_name, ' ') AS lname_agg
    FROM t1.cte1
    LEFT JOIN UNNEST(last_name_rec) AS h
    GROUP BY id
),
distinct_names AS (
    -- One row per distinct (id, last_name).
    SELECT
        id,
        h.last_name AS last_name
    FROM t1.cte1
    LEFT JOIN UNNEST(last_name_rec) AS h
    GROUP BY id, h.last_name
),
test AS (
    -- Join on id: without this condition each name would be paired with the
    -- aggregated names of every id in the table, not just its own.
    SELECT
        x.id,
        x.lname_agg,
        y.last_name
    FROM names_per_id AS x
    INNER JOIN distinct_names AS y
        ON y.id = x.id
)
SELECT
    id,
    sp.string_flatten_dedup(lname_agg, ' ') AS concat_last_name,
    last_name
FROM test;
I'm not sure either if I should store it as an array instead of a concatenated field but it would be good to know how to achieve both.
storing the concat_last_name as an array
I have achieved the first transformation as follows but I had to dedup the concatenated field with a function I wrote.
I'm sure there is a much better way of achieving this.
-- For every (id, last_name) pair in small_test, also return the
-- de-duplicated, space-separated list of all last names for that id.
WITH names_per_id AS (
    -- One row per id: all last names joined with a space.
    SELECT
        id,
        STRING_AGG(h.last_name, ' ') AS lname_agg
    FROM small_test
    LEFT JOIN UNNEST(last_name_rec) AS h
    GROUP BY id
),
distinct_names AS (
    -- One row per distinct (id, last_name).
    SELECT
        id,
        h.last_name AS last_name
    FROM small_test
    LEFT JOIN UNNEST(last_name_rec) AS h
    GROUP BY id, h.last_name
),
test AS (
    -- Join on id: without this condition each name would be paired with the
    -- aggregated names of every id in the table, not just its own.
    SELECT
        x.id AS id,
        x.lname_agg,
        y.last_name
    FROM names_per_id AS x
    INNER JOIN distinct_names AS y
        ON y.id = x.id
)
SELECT
    id,
    sp.string_flatten_dedup(lname_agg, ' ') AS concat_last_name,
    last_name
FROM test;
The function.
string_flatten_dedup
-- Splits string_value on delim, removes duplicate tokens, and joins the
-- remaining tokens back together with delim, sorted in descending order.
--   string_value: delimited list of tokens
--   delim:        separator used both for splitting and for re-joining
CREATE OR REPLACE FUNCTION
sp.string_flatten_dedup(string_value STRING,
delim STRING) AS
(
    ARRAY_TO_STRING(
        ARRAY(
            -- The alias is distinct from the parameter name so it is clear
            -- which one each reference means.
            SELECT DISTINCT token
            FROM UNNEST(SPLIT(string_value, delim)) AS token
            ORDER BY token DESC
        ),
        delim
    )
);
before using function.
intermediate results.
Final result after applying dedup function.
final output
Updated table structure.
t1.ccte1
Yours works but I got the table structure incorrect when I first posted.
-- Materialise the answer's output: one row per (id, last name) plus the
-- comma-joined list of all last names for that id.
CREATE OR REPLACE TABLE t1.cte2 AS
WITH your_table AS (
    SELECT 1 AS id, ['brown', 'smith', 'jones'] AS last_name
    UNION ALL
    SELECT 2, ['ryan', 'murphy']
)
SELECT
    id,
    ln AS last_name,
    ARRAY_TO_STRING(last_name, ',') AS concat_last_name
FROM your_table
CROSS JOIN UNNEST(last_name) AS ln;
-- Re-run of the suggested query, this time against the stored table t1.cte2.
select id, ln as last_name,
array_to_string(last_name, ',') as concat_last_name,
from t1.cte2, unnest(last_name) ln;
-- Fails because the structure is not what I thought it was: cte1 is different from cte2.
-- Against t1.cte1 the repeated field is last_name_rec, so the name is read as ln.last_name.
select id, ln.last_name
--array_to_string(last_name, ',') as concat_last_name,
from t1.cte1, unnest(last_name_rec) ln;
Consider below approach
-- One output row per array element, each carrying the full comma-joined list.
SELECT
    id,
    ln AS last_name,
    ARRAY_TO_STRING(last_name, ',') AS concat_last_name
FROM your_table
CROSS JOIN UNNEST(last_name) AS ln
if applied to sample data in your question data before transformation
-- Sample data standing in for the question's table: each id carries an
-- array of last names. This is only the CTE; a SELECT must follow it.
with your_table as (
select 1 id, ['brown', 'smith', 'jones'] last_name union all
select 2, ['ryan', 'murphy']
)
output is
If you want the last names as an array, you already have that array - see below for how to use it
-- One output row per array element, each carrying the original array as-is.
SELECT
    id,
    ln AS last_name,
    last_name AS concat_last_name
FROM your_table
CROSS JOIN UNNEST(last_name) AS ln
with output

how do i do this in sql?

I have a table of students, a table of courses and a table containing a studentid and a courseid, I want to find all "student friends", meaning that they do at least two equal courses
-- Lists every ordered pair of students who share at least two courses.
-- Each qualifying pair is returned exactly once per direction, no matter
-- how many courses the two students have in common.
WITH StudentsSubjects AS
(
    SELECT 1 AS STUDENT_ID, 'MATH' AS SUBJECTT
    UNION ALL
    SELECT 1, 'ENGLISH'
    UNION ALL
    SELECT 1, 'HISTORY'
    UNION ALL
    SELECT 1, 'CS'
    UNION ALL
    SELECT 2, 'HISTORY'
    UNION ALL
    SELECT 2, 'GEO'
    UNION ALL
    SELECT 3, 'CS'
    UNION ALL
    SELECT 3, 'HISTORY'
)
SELECT
    S.STUDENT_ID AS PRIMARY_STUDENT,
    S2.STUDENT_ID AS SECONDARY_STUDENT
FROM StudentsSubjects AS S
INNER JOIN StudentsSubjects AS S2
    ON S.SUBJECTT = S2.SUBJECTT
    AND S.STUDENT_ID <> S2.STUDENT_ID
GROUP BY
    S.STUDENT_ID,
    S2.STUDENT_ID
-- COUNT(DISTINCT ...) so a repeated enrolment row cannot inflate the number
-- of shared courses.
HAVING COUNT(DISTINCT S.SUBJECTT) >= 2
ORDER BY S.STUDENT_ID
Common Table Expression StudentsSubjects is a substitute for your "a table containing a studentid and a courseid". Replace it with your actual table name

BigQuery: Union on repeated fields with different order of fields

How to make a UNION ALL work for repeated fields if the order of the fields does not match?
In the example below I try to UNION data_1_nested and data_2_nested, while the repeated field nested has two fields: id and age but in different order.
I could UNNEST and re-nest, but this would not be very helpful if I have more than 1 nested field that I need to UNION on.
Example:
-- Example from the question: the two *_nested CTEs build the repeated
-- STRUCT with the same fields in a different order (age, grade versus
-- grade, age) and are then combined with UNION ALL.
WITH data_1 AS (
    SELECT 'a123' AS id, 1 AS age, 'a' AS grade
    UNION ALL
    SELECT 'a123', 3, 'b'
    UNION ALL
    SELECT 'a123', 4.5, 'c'
),
data_2 AS (
    SELECT 'b456' AS id, 6 AS age, 'e' AS grade
    UNION ALL
    SELECT 'b456', 5, 'f'
    UNION ALL
    SELECT 'b456', 2.5, 'g'
),
data_1_nested AS (
    SELECT
        id,
        ARRAY_AGG(STRUCT(age, grade)) AS nested
    FROM data_1
    GROUP BY id
),
data_2_nested AS (
    SELECT
        id,
        ARRAY_AGG(STRUCT(grade, age)) AS nested
    FROM data_2
    GROUP BY id
)
SELECT * FROM data_1_nested
UNION ALL
SELECT * FROM data_2_nested
Below should work for you
-- Rebuild the second table's structs with the fields in (age, grade) order
-- so both sides of the UNION ALL have the same array element type.
SELECT
    id,
    nested
FROM data_1_nested
UNION ALL
SELECT
    t.id,
    ARRAY(SELECT AS STRUCT age, grade FROM t.nested) AS nested
FROM data_2_nested AS t
if applied to sample data from your question - output is
I modified your data a little bit to make 2 nested fields that need to be unioned. I also added a JS function for parsing the JSON. It is an ugly solution, but it seems to be working. Not sure if it is scalable (how many functions would have to be created to convert different nested fields).
-- Parses a JSON array of {age, grade} objects back into an ARRAY of STRUCTs.
-- JSON objects are matched to the STRUCT by field name, so the order of the
-- keys in the input does not matter.
-- age is declared FLOAT64 because the sample data in this script contains
-- fractional ages (4.5, 2.5), which do not fit an INT64 field.
CREATE TEMP FUNCTION JsonToItems(input STRING)
RETURNS ARRAY<STRUCT<age FLOAT64, grade STRING>>
LANGUAGE js AS """
return JSON.parse(input);
""";
-- Serialise both nested columns to JSON on each side so the UNION ALL works
-- on plain strings, then turn the strings back into struct arrays with
-- JsonToItems.
WITH data_1 AS (
    SELECT 'a123' AS id, 1 AS age, 'a' AS grade
    UNION ALL
    SELECT 'a123', 3, 'b'
    UNION ALL
    SELECT 'a123', 4.5, 'c'
),
data_2 AS (
    SELECT 'b456' AS id, 6 AS age, 'e' AS grade
    UNION ALL
    SELECT 'b456', 5, 'f'
    UNION ALL
    SELECT 'b456', 2.5, 'g'
),
data_1_nested AS (
    SELECT
        id,
        ARRAY_AGG(STRUCT(age, grade)) AS nested,
        ARRAY_AGG(STRUCT(age, grade)) AS nested2
    FROM data_1
    GROUP BY id
),
data_2_nested AS (
    SELECT
        id,
        ARRAY_AGG(STRUCT(grade, age)) AS nested,
        ARRAY_AGG(STRUCT(grade, age)) AS nested2
    FROM data_2
    GROUP BY id
)
SELECT
    id,
    JsonToItems(json),
    JsonToItems(json2)
FROM (
    SELECT
        id,
        TO_JSON_STRING(nested) AS json,
        TO_JSON_STRING(nested2) AS json2
    FROM data_1_nested
    UNION ALL
    SELECT
        id,
        TO_JSON_STRING(nested) AS json,
        TO_JSON_STRING(nested2) AS json2
    FROM data_2_nested
);

Select number of IDs in more than one table (from three tables)

I need the count of this:
-- IDs that occur more than once across the stacked rows of A, B and C.
SELECT DISTINCT ID
FROM (
    SELECT ID FROM A
    UNION ALL
    SELECT ID FROM B
    UNION ALL
    SELECT ID FROM C
) ids
GROUP BY ID
HAVING COUNT(*) > 1;
but I have no idea how to do it.
Use a subquery:
-- Number of IDs that occur more than once across the stacked rows of A, B and C.
WITH ids AS (
    SELECT ID FROM A
    UNION ALL
    SELECT ID FROM B
    UNION ALL
    SELECT ID FROM C
),
i AS (
    SELECT ID
    FROM ids
    GROUP BY ID
    HAVING COUNT(*) > 1
)
SELECT COUNT(*)
FROM i;
SELECT DISTINCT is almost never needed with GROUP BY and definitely not in this case.
You just want to find the IDs that appear two or more times across tables A, B and C; the SQL is below:
-- Number of IDs that occur more than once across the stacked rows of A, B and C.
-- NOTE(review): an ID repeated inside a single table is also counted;
-- confirm that ID is unique within each table.
SELECT COUNT(1)
FROM (
    SELECT id
    FROM (
        SELECT ID FROM A
        UNION ALL
        SELECT ID FROM B
        UNION ALL
        SELECT ID FROM C
    ) ids  -- derived tables need an alias on several engines
    GROUP BY id
    HAVING COUNT(1) > 1
) tmp

SQL - How to Order By in UNION query

Is there a way to union two tables, but keep the rows from the first table appearing first in the result set? Note that the ORDER BY column is not in the select list.
For example:
Table 1
name surname
-------------------
John Doe
Bob Marley
Ras Tafari
Table 2
name surname
------------------
Lucky Dube
Abby Arnold
Result
Expected Result:
name surname
-------------------
John Doe
Bob Marley
Ras Tafari
Lucky Dube
Abby Arnold
I am bringing Data by following query
-- Query as posted in the question: the ORDER BY is attached to the first
-- SELECT, ahead of the UNION. "TABLE 1" / "TABLE 2" are the question's
-- placeholder table names.
SELECT name,surname FROM TABLE 1 ORDER BY ID
UNION
SELECT name,surname FROM TABLE 2
The above query is not keeping track of order by after union.
P.S - I dont want to show ID in my select query
I am getting ORDER BY Column by joining tables. Following is my real query
-- Stage the event types that have a sort-order row for user 'abc' in
-- #temptbl, substituting 'UnCategorized' for a missing display name.
SELECT
    so.Appraisal_Event_Type_ID AS Appraisal_Event_Type_ID,
    ISNULL(aet.Appraisal_Event_Type_Display_Name, 'UnCategorized') AS Appraisal_Event_Type_Display_Name
INTO #temptbl
FROM tbl_Event_Type_Sort_Orders AS so
INNER JOIN tbl_Appraisal_Event_Types AS aet
    ON so.Appraisal_Event_Type_ID = aet.Appraisal_Event_Type_ID
WHERE User_Name = 'abc'
ORDER BY so.Sort_Order
-- Combine the staged event types with the event types of every appraisal
-- assigned to staff user 'abc' (in any of the three assignee columns).
SELECT * FROM #temptbl
UNION  -- UNION already removes duplicate rows, so no DISTINCT is needed below
SELECT
    aet.Appraisal_Event_Type_ID AS Appraisal_Event_Type_ID,
    ISNULL(aet.Appraisal_Event_Type_Display_Name, 'UnCategorized') AS Appraisal_Event_Type_Display_Name
FROM tbl_Appraisal_Event_Types AS aet
INNER JOIN tbl_Appraisal_Events AS ae
    ON aet.Appraisal_Event_Type_ID = ae.Event_Type_ID
-- Join the appraisal to its own events. The original compared
-- tbl_Appraisal_Events.Appraisal_ID with itself, which pairs every event
-- with every appraisal.
-- NOTE(review): assumes tbl_Appraisals has an Appraisal_ID key column - confirm.
INNER JOIN tbl_Appraisals AS a
    ON a.Appraisal_ID = ae.Appraisal_ID
WHERE a.Assigned_To_Staff_User = 'abc'
    OR a.Assigned_To_Staff_User2 = 'abc'
    OR a.Assigned_To_Staff_User3 = 'abc'
Put a UNION ALL in a derived table. To keep duplicate elimination, do select distinct and also add a NOT EXISTS to second select to avoid returning same person twice if found in both tables:
-- Tag each row with its source table (tno), skip table2 rows that already
-- exist in table1, then sort by source table first.
WITH dt AS (
    SELECT DISTINCT
        name,
        surname,
        1 AS tno
    FROM table1
    UNION ALL
    SELECT DISTINCT
        t2.name,
        t2.surname,
        2 AS tno
    FROM table2 AS t2
    WHERE NOT EXISTS (
        SELECT *
        FROM table1 AS t1
        WHERE t1.name = t2.name
            AND t1.surname = t2.surname
    )
)
SELECT
    name,
    surname
FROM dt
ORDER BY tno, surname, name
You can use a column for the table and one for the ID to order by:
-- Sort by source table first, then by ID; table2 rows all get ID -1, so
-- only their position after table1 is determined.
SELECT
    x.name,
    x.surname
FROM (
    SELECT
        ID,
        1 AS TableID,
        name,
        surname
    FROM table1
    UNION ALL
    SELECT
        -1 AS ID,
        2 AS TableID,
        name,
        surname
    FROM table2
) AS x
ORDER BY
    x.TableID,
    x.ID
You can write it as below. If you are OK with duplicate data, use UNION ALL instead; it will be faster:
-- Rows of table1 first, then rows of table2, each group ordered by ID.
-- table_no marks the source table; ordering by ID alone would interleave
-- the two tables.
-- Note: a row present with identical values in both tables is returned
-- once per table, because table_no makes the two copies differ.
SELECT
    name,
    surname
FROM (
    SELECT ID, name, surname, 1 AS table_no FROM table1
    UNION
    SELECT ID, name, surname, 2 AS table_no FROM table2
) t
ORDER BY table_no, ID
this will order the first row sets first then by anything you need
(haven't tested the code)
-- Tag every row with its source table, then sort by source table and ID so
-- the rows of table1 come out first.
-- table1 / table2 replace the placeholder names "TABLE 1" / "TABLE 2",
-- which are not valid identifiers.
;WITH cte_1 AS (
    SELECT ID, name, surname, 1 AS table_id FROM table1
    UNION
    SELECT ID, name, surname, 2 AS table_id FROM table2
)
SELECT
    name,
    surname
FROM cte_1
ORDER BY table_id, ID
Simply use a UNION clause without ORDER BY.
-- Plain de-duplicating union of both tables; no ordering is specified.
-- table1 / table2 replace the placeholder names "TABLE 1" / "TABLE 2",
-- which are not valid identifiers.
SELECT name, surname FROM table1
UNION
SELECT name, surname FROM table2
if you wanted to order first table use the below query.
-- Distinct (name, surname) pairs from both tables, with pairs found in
-- table1 listed first in table1's Id order, followed by pairs that exist
-- only in table2.
-- The original computed a row number but discarded it before the UNION, so
-- nothing determined the output order. Here the ordering keys are carried
-- through and applied in the final ORDER BY.
;WITH cte_1 AS (
    SELECT name, surname, 1 AS table_id, Id AS sort_id FROM table1
    UNION ALL
    SELECT name, surname, 2 AS table_id, NULL AS sort_id FROM table2
)
SELECT
    name,
    surname
FROM cte_1
-- GROUP BY keeps the de-duplication that UNION provided on (name, surname).
GROUP BY name, surname
ORDER BY MIN(table_id), MIN(sort_id)