bigquery transpose and concatenate for each record - sql

I want to achieve the following transformation.
I have last_name stored in a repeated record as follows.
data before transformation
I want to achieve the following.
data after transformation
Example with sample data created.
-- Sample data: one row per id, with the last names collected into a
-- repeated record column (an array of structs holding last_name).
CREATE OR REPLACE TABLE t1.cte1 AS
SELECT
  src.id,
  ARRAY_AGG(STRUCT(src.last_name AS last_name)) AS last_name_rec
FROM UNNEST([
  STRUCT(1 AS id, 'eren' AS last_name),
  STRUCT(1 AS id, 'yilmaz' AS last_name),
  STRUCT(1 AS id, 'kaya' AS last_name)
]) AS src
GROUP BY src.id;
-- For every id, list each distinct last name next to the id's full,
-- space-separated list of last names.
WITH names_per_id AS (
  -- One row per id: all last names of that id joined with ' '.
  SELECT
    id,
    STRING_AGG(h.last_name, ' ') AS lname_agg
  FROM t1.cte1
  LEFT JOIN UNNEST(last_name_rec) AS h
  GROUP BY id
),
distinct_names AS (
  -- One row per distinct (id, last_name) pair.
  SELECT DISTINCT
    id,
    h.last_name AS last_name
  FROM t1.cte1
  LEFT JOIN UNNEST(last_name_rec) AS h
),
test AS (
  -- Join on id. A comma (cross) join here would pair every id's aggregate
  -- with the last names of all other ids once the table holds more than one id.
  SELECT
    x.id,
    x.lname_agg,
    y.last_name
  FROM names_per_id AS x
  INNER JOIN distinct_names AS y
    ON x.id = y.id
)
SELECT
  id,
  sp.string_flatten_dedup(lname_agg, ' ') AS concat_last_name,
  last_name
FROM test;
I'm also not sure whether I should store it as an array instead of a concatenated field, but it would be good to know how to achieve both.
storing the concat_last_name as an array
I have achieved the first transformation as follows but I had to dedup the concatenated field with a function I wrote.
I'm sure there is a much better way of achieving this.
-- For every id in small_test, list each distinct last name next to the id's
-- full, space-separated list of last names.
WITH names_per_id AS (
  -- One row per id: all last names of that id joined with ' '.
  SELECT
    id,
    STRING_AGG(h.last_name, ' ') AS lname_agg
  FROM small_test
  LEFT JOIN UNNEST(last_name_rec) AS h
  GROUP BY id
),
distinct_names AS (
  -- One row per distinct (id, last_name) pair.
  SELECT DISTINCT
    id,
    h.last_name AS last_name
  FROM small_test
  LEFT JOIN UNNEST(last_name_rec) AS h
),
test AS (
  -- Join on id. A comma (cross) join here would pair every id's aggregate
  -- with the last names of all other ids once the table holds more than one id.
  SELECT
    x.id AS id,
    x.lname_agg,
    y.last_name
  FROM names_per_id AS x
  INNER JOIN distinct_names AS y
    ON x.id = y.id
)
SELECT
  id,
  sp.string_flatten_dedup(lname_agg, ' ') AS concat_last_name,
  last_name
FROM test;
The function.
string_flatten_dedup
-- Splits string_value on delim, removes duplicate tokens and joins the
-- remaining tokens back together with delim, sorted in descending order.
CREATE OR REPLACE FUNCTION sp.string_flatten_dedup(string_value STRING, delim STRING) AS (
  ARRAY_TO_STRING(
    ARRAY(
      SELECT DISTINCT token
      FROM UNNEST(SPLIT(string_value, delim)) AS token
      ORDER BY token DESC
    ),
    delim
  )
);
before using function.
intermediate results.
Final result after applying dedup function.
final output
Updated table structure.
t1.ccte1
Yours works but I got the table structure incorrect when I first posted.
-- Materialize the flattened shape: one row per (id, last name), with the
-- comma-separated list of all last names of that id alongside.
create or replace table t1.cte2 as
with your_table as (
  select *
  from unnest([
    struct(1 as id, ['brown', 'smith', 'jones'] as last_name),
    struct(2 as id, ['ryan', 'murphy'] as last_name)
  ])
)
select
  t.id,
  ln as last_name,
  array_to_string(t.last_name, ',') as concat_last_name
from your_table as t
cross join unnest(t.last_name) as ln;
-- Same query pointed at the stored table t1.cte2.
select
  t.id,
  ln as last_name,
  array_to_string(t.last_name, ',') as concat_last_name
from t1.cte2 as t
cross join unnest(t.last_name) as ln;
-- fails because the structure is not what I thought it was: cte1 is different from cte2
select
  t.id,
  ln.last_name
  -- array_to_string(last_name, ',') as concat_last_name,
from t1.cte1 as t
cross join unnest(t.last_name_rec) as ln;

Consider below approach
-- One output row per array element, each carrying the comma-separated
-- list of all last names of its id.
select
  t.id,
  ln as last_name,
  array_to_string(t.last_name, ',') as concat_last_name
from your_table as t
cross join unnest(t.last_name) as ln
if applied to sample data in your question data before transformation
-- Inline sample data: two ids, each with an array of last names.
with your_table as (
  select *
  from unnest([
    struct(1 as id, ['brown', 'smith', 'jones'] as last_name),
    struct(2 as id, ['ryan', 'murphy'] as last_name)
  ])
)
output is
If you want the last names as an array, you already have that array; see below for how to use it
-- One output row per array element, each carrying the complete array of
-- last names of its id unchanged.
select
  t.id,
  ln as last_name,
  t.last_name as concat_last_name
from your_table as t
cross join unnest(t.last_name) as ln
with output

Related

How to union an array when grouping?

I am trying to combine multiple array columns into one with distinct elements and then get a count of distinct elements. How can I do something like that in postgres?
-- Two single-row temp tables; fn is a first name and ln an array of last names.
create temp table t as ( select 'james' as fn, array ['bond', 'milner'] as ln );
create temp table tt as ( select 'james' as fn, array ['mcface', 'milner'] as ln );
-- expected value: james, 3
-- Unfinished attempt from the question: array_length() is left without
-- arguments as the open part of the problem.
-- NOTE(review): the derived table x only exposes fn and ln, so x.name
-- presumably should be x.fn -- confirm.
select x.name,
array_length()-- what to do here?
from (
select fn, ln
from t
union
select fn, ln
from tt
) as x
group by x.name
You should unnest the arrays in the inner queries:
-- Expand both arrays to one row per element. UNION removes duplicate
-- (fn, elem) pairs, so count(elem) is the number of distinct elements per fn.
with elems as (
    select fn, unnest(ln) as elem
    from t
    union
    select fn, unnest(ln) as elem
    from tt
)
select elems.fn,
       count(elems.elem)
from elems
group by elems.fn
Db<>fiddle.
Why do you (want to) use arrays? That's not needed. Simply have a derived table using UNION, which eliminates duplicates, GROUP BY the name and use count().
-- Counts the distinct (name, ln) rows per name. UNION already removes
-- duplicates across t and tt, so a plain count(*) is enough.
-- t and tt as created in the question expose the first name as fn, so it is
-- aliased to name here; a bare reference to name does not resolve.
-- NOTE(review): this equals the number of distinct last names only when ln
-- holds a single last name per row rather than an array -- confirm.
SELECT name,
       count(*)
FROM (SELECT fn AS name,
             ln
      FROM t
      UNION
      SELECT fn AS name,
             ln
      FROM tt) AS x
GROUP BY name;
db<>fiddle
Side note: PostgreSQL 9.3 has been out of support for a while. Consider upgrading.

How to dedup array_agg in bigquery

I created a new table with repeating records with duplicates.
I am trying to find the most efficient way to deduplicate records as this will be run
on a table with millions of records.
If you use multiple nested CTEs, does the data structure matter? Is the processing done in memory, or is data written to temp tables when there is a lot of it?
-- Sample data with repeated last names per id, stored as a repeated record
-- column; the duplicates are kept on purpose.
CREATE OR REPLACE TABLE t1.cte4 AS
SELECT
  src.id,
  ARRAY_AGG(STRUCT(src.last_name AS last_name)) AS last_name_rec
FROM UNNEST([
  STRUCT(1 AS id, 'eren' AS last_name),
  STRUCT(1 AS id, 'yilmaz' AS last_name),
  STRUCT(1 AS id, 'kaya' AS last_name),
  STRUCT(1 AS id, 'kaya' AS last_name),
  STRUCT(2 AS id, 'smith' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'brown' AS last_name)
]) AS src
GROUP BY src.id;
I can remove duplicates as follows.
QUERY 1 How to dedup the concat_struct ?
-- Per id: a '~'-separated string of the distinct last names, plus an array
-- of structs that still contains every (possibly repeated) last name.
select
  t.id,
  string_agg(distinct ln.last_name, '~') as concat_string,
  array_agg(struct(ln.last_name as last_name)) as concat_struct
from `t1.cte4` as t
cross join unnest(t.last_name_rec) as ln
group by t.id;
QUERY 1
QUERY 2 Is there a better way than this to dedup?
-- Step 1 reduces the unnested rows to distinct (id, last_name) pairs.
-- Step 2 collects the names of each id with a window aggregate, and
-- SELECT DISTINCT collapses the repeated per-row result to one row per id.
with distinct_names as (
  select
    t.id,
    ln.last_name as last_name
  from `t1.cte4` as t
  cross join unnest(t.last_name_rec) as ln
  group by t.id, ln.last_name
)
select distinct
  id,
  to_json_string(array_agg(last_name) over (partition by id)) as json_string
from distinct_names;
QUERY 2
How do I get it out of the table as distinct values rather than using the CTE? This does not dedup.
-- Rebuilds the array of structs per id from the unnested rows; repeated
-- last names are carried over as they are.
select
  t.id,
  array_agg(struct(ln.last_name as last_name)) as concat_struct
from t1.cte4 as t
cross join unnest(t.last_name_rec) as ln
group by t.id;
I can't do this.
-- Attempt from the question, reported as not working: DISTINCT is applied
-- directly to a STRUCT argument inside ARRAY_AGG.
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;
UPDATE: Decompose the struct before deduplication and then compose it back:
-- Decompose the struct, dedup on the plain columns, then compose it back:
-- the inner query yields distinct (id, last_name) pairs, the outer query
-- rebuilds one array of structs per id.
-- The table is referenced as t1.cte4, the name it is created under.
select
  d.id,
  array_agg(struct(d.last_name as last_name)) as concat_struct
from (
  select
    t.id,
    ln.last_name as last_name
  from t1.cte4 as t
  cross join unnest(t.last_name_rec) as ln
  group by t.id, ln.last_name
) as d
group by d.id;
(original answer based on unwanted change of table definition follows)
Just use array_agg(distinct ...):
-- Aggregates the distinct last names of each id into a plain string array.
SELECT
  src.id,
  ARRAY_AGG(DISTINCT src.last_name) AS last_name_rec
FROM UNNEST([
  STRUCT(1 AS id, 'eren' AS last_name),
  STRUCT(1 AS id, 'yilmaz' AS last_name),
  STRUCT(1 AS id, 'kaya' AS last_name),
  STRUCT(1 AS id, 'kaya' AS last_name),
  STRUCT(2 AS id, 'smith' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'jones' AS last_name),
  STRUCT(2 AS id, 'brown' AS last_name)
]) AS src
GROUP BY src.id;

Oracle Regex - Remove duplicates including chinese

I'm trying to remove the duplicates in the results of a query involving listagg.
I'm using this syntax:
-- Aggregates PR.NAME into a comma-separated list, then replaces any text
-- followed by one or more ",<same text>" repeats with a single occurrence.
-- ORDER BY 1 sorts on a constant, so the order inside the list is not
-- controlled by this expression.
REGEXP_REPLACE (LISTAGG (PR.NAME, ',' ) WITHIN GROUP (ORDER BY 1),
'([^,]+)(,\1)+',
'\1') AS PRODUCERS
However, occurrences that include Chinese characters are not removed:
Any ideas?
Your regular expression does not work. If the LISTAGG output is A,A,AA then the regular expression ([^,]+)(,\1)+ does not check that it has matched a complete element of your list and will match A,A,A which is 2½ elements of the list and will give the output AA instead of the expected A,AA. Worse, if you have the string BA,BABAB,BABD then the regular expression will replace BA,BA with BA and then replace BAB,BAB with BAB and you end up with the string BABABD which does not match any of the elements of the original list.
An example demonstrating this is:
SQL Fiddle
Oracle 11g R2 Schema Setup:
-- Demo table: nine rows with repeated and overlapping names (A, AA, B, BA, C).
CREATE TABLE names AS
SELECT 1 AS id, 'A' AS name FROM DUAL UNION ALL
SELECT 2,       'A'         FROM DUAL UNION ALL
SELECT 3,       'B'         FROM DUAL UNION ALL
SELECT 4,       'C'         FROM DUAL UNION ALL
SELECT 5,       'A'         FROM DUAL UNION ALL
SELECT 6,       'AA'        FROM DUAL UNION ALL
SELECT 7,       'A'         FROM DUAL UNION ALL
SELECT 8,       'BA'        FROM DUAL UNION ALL
SELECT 9,       'A'         FROM DUAL
/
Query 1:
-- Demonstration of the regular-expression approach on the names table:
-- LISTAGG builds the comma-separated list (sorted on the constant 1), and
-- REGEXP_REPLACE then collapses text followed by ",<same text>" repeats.
SELECT REGEXP_REPLACE (
LISTAGG (NAME, ',' ) WITHIN GROUP (ORDER BY 1),
'([^,]+)(,\1)+',
'\1'
) AS constant_sort
FROM names
Results:
| CONSTANT_SORT |
|---------------|
| AA,BA,C |
If you want to get the distinct elements then you can use DISTINCT (as per Littlefoot's answer) or you can COLLECT the values into a user-defined collection and then use the SET function to remove duplicates. You can then pass this de-duplicated collection to a table collection expression and use LISTAGG to get your output:
Oracle 11g R2 Schema Setup:
-- Collection type (table of VARCHAR2(4000)) that COLLECT results are cast to.
CREATE TYPE StringList IS TABLE OF VARCHAR2(4000)
/
Query 2:
-- Step 1 (n): COLLECT the names in name order into a StringList and let SET
-- remove the duplicates.
-- Step 2: turn the collection back into rows and LISTAGG them in row order.
WITH n AS (
  SELECT SET( CAST( COLLECT( name ORDER BY NAME ) AS StringList ) ) AS unique_names
  FROM   names
)
SELECT (
         SELECT LISTAGG( column_value, ',' ) WITHIN GROUP ( ORDER BY ROWNUM )
         FROM   TABLE( n.unique_names )
       ) AS agg_names
FROM   n
Results:
| AGG_NAMES |
|-------------|
| A,AA,B,BA,C |
Regarding your comment:
in the context of a bigger query involving a lot of join and given my begginers skills I would have no idea how to implement this model
For example, if your query was:
-- "Before" example: the regular-expression dedup applied to LISTAGG inside a
-- larger query with a join, a filter and a GROUP BY on two other columns.
-- The list is sorted on the constant 1.
SELECT REGEXP_REPLACE(
LISTAGG (PR.NAME, ',' ) WITHIN GROUP (ORDER BY 1),
'([^,]+)(,\1)+',
'\1'
) AS PRODUCERS,
other_column1,
other_column2
FROM table1 pr
INNER JOIN table2 t2
ON (pr.some_condition = t2.some_condition )
WHERE t2.some_other_condition = 'TRUE'
GROUP BY other_column1, other_column2
Then you can change it to:
-- "After" example. Step 1 (t): per (other_column1, other_column2) group,
-- COLLECT the names in name order into a StringList and dedup it with SET.
-- Step 2: expand each collection to rows and LISTAGG them in row order.
WITH t AS (
  SELECT SET( CAST( COLLECT( PR.name ORDER BY PR.NAME ) AS StringList ) ) AS PRODUCERS,
         other_column1,
         other_column2
  FROM   table1 pr
         INNER JOIN table2 t2
         ON (pr.some_condition = t2.some_condition )
  WHERE  t2.some_other_condition = 'TRUE'
  GROUP BY other_column1, other_column2
)
SELECT (
         SELECT LISTAGG( COLUMN_VALUE, ',' ) WITHIN GROUP ( ORDER BY ROWNUM )
         FROM   TABLE( t.PRODUCERS )
       ) AS producers,
       other_column1,
       other_column2
FROM   t
(I can't see images; company policy).
Why wouldn't you remove duplicates before applying LISTAGG? Something like
-- Remove duplicates first, then aggregate: the inner query yields each name
-- once, the outer query joins them into a comma-separated list.
-- The list is ordered by the name itself; ordering on the constant 1 would
-- leave the order of the elements undefined.
select listagg(x.distinct_name, ',') within group (order by x.distinct_name) producers
from (select distinct name distinct_name
      from some_table
     ) x
Another way to remove duplicates is to use window functions and case:
-- Template: the ". . ." parts stand for the columns and tables of the real query.
-- The inner query numbers the rows of each name (seqnum); the CASE passes the
-- name to LISTAGG only for the row with seqnum = 1 and NULL for the others.
select listagg(case when seqnum = 1 then name end, ',') within group (order by 1) as producers
from (select . . .,
row_number() over (partition by name order by name) as seqnum
from . . .
) t
This does require modifications to the rest of the query, but you should still be able to do the rest of the aggregations and computations.

Oracle: SQL Dynamic cursor statement

I have a dynamic temporary table like below.
Table name for assumption: TB_EMP_TEMP_TABLE
Column1 | column2 | column3
Emp_NM | EMP_ID |TB_EMP_DTLS
Emp_Adr | EMP_ID |TB_EMP_DTLS
Emp_Sal | EMP_ID |TB_EMP_OTHER
The above data is retrieved as a cursor (Emp_cursor), and I need to construct a dynamic SQL query like the one below based on the cursor data.
Expected Output:
SELECT TB_EMP_DTLS.EMP_NM,TB_EMP_DTLS.EMP_Adr,TB_EMP_OTHER.EMP_SAL
FROM TB_EMP_DTLS,TB_EMP_OTHER
WHERE TB_EMP_DTLS.EMP_ID=TB_EMP_OTHER.EMP_ID
I haven't worked extensively with PL/SQL or cursors. How can the cursor be looped over to get the expected output?
If I understand it right, you want the column1 values selected from the column3 tables, joined on the column2 columns.
It's not elegant but should work:
-- Builds the text of the dynamic SELECT statement from the metadata rows in
-- tb_emp_temp_table (column1 = selected column, column2 = join column,
-- column3 = table). Each numbered fragment n is either a clause keyword or a
-- generated list; the outer LISTAGG glues the fragments together in order of n.
select listagg(v, ' ') within group (order by n asc) my_cursor from (
  with
    tb as (select distinct column3 val from tb_emp_temp_table), --tables
    sl as (select distinct column3||'.'||column1 val from tb_emp_temp_table), --selected columns
    pr as (select distinct column3||'.'||column2 val from tb_emp_temp_table) --predicates
  -- The fragments are disjoint by n, so UNION ALL is sufficient.
  select 1 n, 'SELECT' v from dual
  union all
  select 2 n, listagg(val, ', ') within group (order by val) v from sl
  union all
  select 3 n, 'FROM' v from dual
  union all
  select 4 n, listagg(val, ', ') within group (order by val) v from tb
  union all
  select 5 n, 'WHERE' v from dual
  union all
  -- pra.val < prb.val emits each pair of join columns once; != would emit
  -- both A=B and B=A for the same pair.
  select 6 n,
         listagg(pra.val||'='||prb.val, ' AND ') within group (order by pra.val, prb.val) v
  from pr pra
  inner join pr prb
    on pra.val < prb.val
)

SQL - How to Order By in UNION query

Is there a way to union two tables but keep the rows from the first table appearing first in the result set? Note that the ORDER BY column is not in the SELECT list.
For example:
Table 1
name surname
-------------------
John Doe
Bob Marley
Ras Tafari
Table 2
name surname
------------------
Lucky Dube
Abby Arnold
Result
Expected Result:
name surname
-------------------
John Doe
Bob Marley
Ras Tafari
Lucky Dube
Abby Arnold
I am bringing Data by following query
-- Attempt from the question: the ORDER BY is attached to the first SELECT,
-- ahead of the UNION. "TABLE 1" and "TABLE 2" stand for the two tables shown above.
SELECT name,surname FROM TABLE 1 ORDER BY ID
UNION
SELECT name,surname FROM TABLE 2
The above query is not keeping track of order by after union.
P.S - I dont want to show ID in my select query
I am getting ORDER BY Column by joining tables. Following is my real query
-- Step 1: the event types that have an explicit sort order for user 'abc'
-- are written to #temptbl.
-- NOTE(review): ORDER BY on SELECT ... INTO is not expected to fix the order
-- in which later queries read #temptbl back -- confirm against the SQL Server docs.
SELECT
    so.Appraisal_Event_Type_ID AS Appraisal_Event_Type_ID,
    ISNULL(et.Appraisal_Event_Type_Display_Name, 'UnCategorized') AS Appraisal_Event_Type_Display_Name
INTO #temptbl
FROM tbl_Event_Type_Sort_Orders AS so
INNER JOIN tbl_Appraisal_Event_Types AS et
    ON so.Appraisal_Event_Type_ID = et.Appraisal_Event_Type_ID
WHERE User_Name = 'abc'
ORDER BY so.Sort_Order;

-- Step 2: add the event types used by appraisals assigned to 'abc' in any of
-- the three staff columns. UNION removes duplicates, so no DISTINCT is needed.
SELECT * FROM #temptbl
UNION
SELECT
    et.Appraisal_Event_Type_ID AS Appraisal_Event_Type_ID,
    ISNULL(et.Appraisal_Event_Type_Display_Name, 'UnCategorized') AS Appraisal_Event_Type_Display_Name
FROM tbl_Appraisal_Event_Types AS et
INNER JOIN tbl_Appraisal_Events AS ev
    ON et.Appraisal_Event_Type_ID = ev.Event_Type_ID
-- Join events to their appraisal. Comparing ev.Appraisal_ID with itself would
-- match every appraisal row to every event.
-- NOTE(review): assumes tbl_Appraisals exposes Appraisal_ID -- confirm.
INNER JOIN tbl_Appraisals AS ap
    ON ev.Appraisal_ID = ap.Appraisal_ID
WHERE 'abc' IN (ap.Assigned_To_Staff_User, ap.Assigned_To_Staff_User2, ap.Assigned_To_Staff_User3)
Put a UNION ALL in a derived table. To keep duplicate elimination, do select distinct and also add a NOT EXISTS to second select to avoid returning same person twice if found in both tables:
-- table1 rows are tagged 1, table2 rows that have no (name, surname) match in
-- table1 are tagged 2; ordering by the tag keeps table1 rows first.
with table1_rows as (
    select distinct name, surname, 1 as tno
    from table1
),
table2_only_rows as (
    select distinct t2.name, t2.surname, 2 as tno
    from table2 t2
    where not exists (select *
                      from table1 t1
                      where t2.name = t1.name
                        and t2.surname = t1.surname)
),
dt as (
    select name, surname, tno from table1_rows
    union all
    select name, surname, tno from table2_only_rows
)
select name, surname
from dt
order by tno, surname, name
You can use a column for the table and one for the ID to order by:
-- Tags every row with the table it came from; table2 rows get the constant
-- ID -1. Sorting by the tag first keeps table1 rows ahead of table2 rows.
SELECT
    x.name,
    x.surname
FROM (
    SELECT ID, 1 AS TableID, name, surname
    FROM table1
    UNION ALL
    SELECT -1 AS ID, 2 AS TableID, name, surname
    FROM table2
) AS x
ORDER BY
    x.TableID,
    x.ID
You can write it as below. If you are OK with duplicate data, use UNION ALL instead; it will be faster:
-- Rows of the first table come first, then rows of the second; within each
-- table the rows are sorted by ID. Sorting by ID alone would interleave the
-- rows of both tables.
-- GROUP BY ID, name, surname keeps the duplicate removal that UNION gave:
-- a row present in both tables appears once, at its first-table position.
-- "TABLE 1" and "TABLE 2" are the placeholder table names from the question.
SELECT NAME, surname FROM (
SELECT ID,name,surname,1 AS tno FROM TABLE 1
UNION ALL
SELECT ID,name,surname,2 AS tno FROM TABLE 2 ) t
GROUP BY ID, NAME, surname
ORDER BY MIN(tno), ID
this will order the first row sets first then by anything you need
(haven't tested the code)
-- Tags each row with the number of the table it came from and sorts by that
-- tag first, then by ID.
-- "TABLE 1" and "TABLE 2" are the placeholder table names from the question.
SELECT src.name, src.surname
FROM (
    SELECT ID, name, surname, 1 AS table_id FROM TABLE 1
    UNION
    SELECT ID, name, surname, 2 AS table_id FROM TABLE 2
) AS src
ORDER BY src.table_id, src.ID
Simply use a UNION clause without ORDER BY.
-- Plain UNION of both tables, with no ORDER BY.
-- "TABLE 1" and "TABLE 2" are the placeholder table names from the question.
SELECT
    name,
    surname
FROM TABLE 1
UNION
SELECT
    name,
    surname
FROM TABLE 2
if you wanted to order first table use the below query.
-- Orders the first table's rows by Id and puts the second table's rows after them.
-- b is the row number by Id for first-table rows and NULL for second-table
-- rows. Computing b without using it in an ORDER BY orders nothing, so the
-- final query sorts on it explicitly.
-- GROUP BY name, surname keeps the duplicate removal that UNION gave; a pair
-- present in both tables keeps its first-table position.
-- "TABLE 1" and "TABLE 2" are the placeholder table names from the question.
;WITH cte_1
AS
(SELECT name,surname,ROW_NUMBER()OVER(ORDER BY Id)b FROM TABLE 1
UNION ALL
SELECT name,surname,CAST(NULL AS BIGINT) b FROM TABLE 2 )
SELECT name,surname
FROM cte_1
GROUP BY name,surname
ORDER BY CASE WHEN MIN(b) IS NULL THEN 1 ELSE 0 END, MIN(b)