Say I have this:
col_1 | col_2
------+------
1     | a
1     | b
1     | c
2     | d
2     | e
I want a result like this:
col_1 | col_2_concat
------+-------------
1     | a,b,c
2     | d,e
Something like this is what I would guess:
select
  col_1,
  join_by_comma(col_2)
from tbl
group by col_1
I think you need something like:
WITH x AS (
  SELECT 'a' AS c, 'b' AS c2
  UNION ALL
  SELECT 'a', 'b2'
  UNION ALL
  SELECT 'a2', 'b3'
  UNION ALL
  SELECT 'a2', 'b4'
)
SELECT
  c,
  ARRAY_JOIN(ARRAY_AGG(c2), ',') AS c2
FROM x
GROUP BY
  c
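For what it's worth (my note, not part of the answer above): ARRAY_JOIN/ARRAY_AGG is Presto/Athena syntax. Many engines expose the same idea as a single aggregate - STRING_AGG in BigQuery, PostgreSQL and SQL Server 2017+, GROUP_CONCAT in MySQL, LISTAGG in Oracle. A minimal sketch against the sample table, assuming an engine with STRING_AGG:
select
  col_1,
  string_agg(col_2, ',') as col_2_concat
from tbl
group by col_1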
I have a table that looks like this:
+----+------+------+------+------+------+
| id | col1 | col2 | col3 | col4 | col5 |
+----+------+------+------+------+------+
| a  | 1    | null | null | null | null |
| b  | 1    | 2    | 3    | 4    | null |
| c  | 1    | 2    | 3    | 4    | 5    |
| d  | 2    | 1    | 7    | null | 4    |
+----+------+------+------+------+------+
I want to create an aggregated table where, for each id, I get an array containing the non-null values from all the other columns. The output should look like this:
+----+-------------+
| id | agg_col     |
+----+-------------+
| a  | [1]         |
| b  | [1,2,3,4]   |
| c  | [1,2,3,4,5] |
| d  | [2,1,7,4]   |
+----+-------------+
Is it possible to produce this output using BigQuery Standard SQL?
Below is not a super generic solution, but it works for the specific example you provided - the id is alphanumeric (not starting with a digit) and the rest of the columns are integers.
#standardSQL
SELECT id,
  ARRAY(
    SELECT * FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(\d*)')) col
    WHERE col != ''
  ) AS agg_col_as_array,
  CONCAT('[', ARRAY_TO_STRING(ARRAY(
    SELECT * FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(\d*)')) col
    WHERE col != ''
  ), ','), ']') AS agg_col_as_string
FROM `project.dataset.table` t
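To see why this works (my illustration, not from the original answer): TO_JSON_STRING(t) serializes the whole row, so for row b it should produce something like
{"id":"b","col1":1,"col2":2,"col3":3,"col4":4,"col5":null}
The regex r':(\d*)' captures the (possibly empty) run of digits after each colon; the empty captures - from the quoted id and from null - are then dropped by WHERE col != ''.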
You can test and play with the above using the sample data from your question:
#standardSQL
WITH `project.dataset.table` AS (
  SELECT 'a' id, 1 col1, NULL col2, NULL col3, NULL col4, NULL col5 UNION ALL
  SELECT 'b', 1, 2, 3, 4, NULL UNION ALL
  SELECT 'c', 1, 2, 3, 4, 5 UNION ALL
  SELECT 'd', 2, 1, 7, NULL, 4
)
SELECT id,
  ARRAY(
    SELECT * FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(\d*)')) col
    WHERE col != ''
  ) AS agg_col_as_array,
  CONCAT('[', ARRAY_TO_STRING(ARRAY(
    SELECT * FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(\d*)')) col
    WHERE col != ''
  ), ','), ']') AS agg_col_as_string
FROM `project.dataset.table` t
-- ORDER BY id
with this result:
Row | id | agg_col_as_array | agg_col_as_string
----+----+------------------+------------------
1   | a  | 1                | [1]
2   | b  | 1, 2, 3, 4       | [1,2,3,4]
3   | c  | 1, 2, 3, 4, 5    | [1,2,3,4,5]
4   | d  | 2, 1, 7, 4       | [2,1,7,4]
Do you think it is possible to do this by mentioning specific columns and then binding them into an array?
Sure, it is doable - see below
#standardSQL
WITH `project.dataset.table` AS (
  SELECT 'a' id, 1 col1, NULL col2, NULL col3, NULL col4, NULL col5 UNION ALL
  SELECT 'b', 1, 2, 3, 4, NULL UNION ALL
  SELECT 'c', 1, 2, 3, 4, 5 UNION ALL
  SELECT 'd', 2, 1, 7, NULL, 4
)
SELECT id,
  ARRAY(
    SELECT col
    FROM UNNEST([col1, col2, col3, col4, col5]) col
    WHERE NOT col IS NULL
  ) AS agg_col_as_array,
  CONCAT('[', ARRAY_TO_STRING(
    ARRAY(
      SELECT CAST(col AS STRING)
      FROM UNNEST([col1, col2, col3, col4, col5]) col
      WHERE NOT col IS NULL
    ), ','), ']') AS agg_col_as_string
FROM `project.dataset.table` t
-- ORDER BY id
BUT ... this is not the best option you have, as you need to manage and adjust the number and the names of the columns for each different use case.
The solution below is an adjusted version of my original answer to address your latest comment - "Actually the sample was too simple. Both my id and the other columns have alphanumeric and special characters."
#standardSQL
WITH `project.dataset.table` AS (
  SELECT 'a' id, 1 col1, NULL col2, NULL col3, NULL col4, NULL col5 UNION ALL
  SELECT 'b', 1, 2, 3, 4, NULL UNION ALL
  SELECT 'c', 1, 2, 3, 4, 5 UNION ALL
  SELECT 'd', 2, 1, 7, NULL, 4
)
SELECT id,
  ARRAY(
    SELECT col
    FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(.*?)(?:,|})')) col WITH OFFSET
    WHERE col != 'null' AND OFFSET > 0
  ) AS agg_col_as_array,
  CONCAT('[', ARRAY_TO_STRING(
    ARRAY(
      SELECT col
      FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(t), r':(.*?)(?:,|})')) col WITH OFFSET
      WHERE col != 'null' AND OFFSET > 0
    ), ','), ']') AS agg_col_as_string
FROM `project.dataset.table` t
-- ORDER BY id
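An illustration of what changed (mine, not from the answer): for row d, TO_JSON_STRING(t) should yield
{"id":"d","col1":2,"col2":1,"col3":7,"col4":null,"col5":4}
The regex r':(.*?)(?:,|})' lazily captures everything between each colon and the next comma or closing brace, so it works for non-numeric values too. OFFSET > 0 drops the first capture (the id itself), and col != 'null' drops the missing values. Two caveats: string values keep their JSON double quotes, and values that themselves contain commas or braces would still break the pattern.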
Both give the same result as before:
Row | id | agg_col_as_array | agg_col_as_string
----+----+------------------+------------------
1   | a  | 1                | [1]
2   | b  | 1, 2, 3, 4       | [1,2,3,4]
3   | c  | 1, 2, 3, 4, 5    | [1,2,3,4,5]
4   | d  | 2, 1, 7, 4       | [2,1,7,4]
For example, we have a table with three columns:
Column A | Column B | Column C
---------+----------+---------
P1       | 10       | A
P1       | 20       | B
P1       | 30       | C
P2       | 30       | D
P2       | 40       | E
How do I have to write a SELECT query in Microsoft SQL Server to get this kind of result?
Column A | Column B   | Column C
---------+------------+---------
P1       | 10, 20, 30 | A, B, C
P2       | 30, 40     | D, E
Try this:
CREATE TABLE #temp (ColumnA VARCHAR(10), ColumnB INT, ColumnC VARCHAR(10))
-- insert rows
SELECT t.[ColumnA],
       [ColumnB] = STUFF((SELECT ',' + CAST([ColumnB] AS VARCHAR) FROM #temp WHERE [ColumnA] = t.[ColumnA] FOR XML PATH('')), 1, 1, ''),
       [ColumnC] = STUFF((SELECT ',' + [ColumnC] FROM #temp WHERE [ColumnA] = t.[ColumnA] FOR XML PATH('')), 1, 1, '')
FROM #temp t
GROUP BY t.[ColumnA]
I tested it.
If you are running SQL Server 2017, the T-SQL enhancements include the STRING_AGG string concatenation function.
You can simply use the following SELECT statement to aggregate string values from different rows into a single column value:
select
ColumnA,
string_agg(ColumnB,',') ColumnB,
string_agg(ColumnC,',') ColumnC
from TestTable1
group by ColumnA
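A side note (mine, not part of the answer above): STRING_AGG does not guarantee the order of the concatenated values by itself; SQL Server supports an optional WITHIN GROUP clause if you need a deterministic order:
select
    ColumnA,
    string_agg(ColumnB, ',') within group (order by ColumnB) ColumnB,
    string_agg(ColumnC, ',') within group (order by ColumnC) ColumnC
from TestTable1
group by ColumnA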
You could try the XML method with the STUFF() function:
select t.[Column A] ,
[Column B] =
stuff(
(select ','+cast([Column B] as varchar) from <table> where [Column A] = t.[Column A] for xml path('')),
1,1,''),
[Column C] = stuff(
(select ','+[Column C] from <table> where [Column A] = t.[Column A] for xml path('')),
1,1,'')
from <table> t
group by t.[Column A]
-- same idea with DISTINCT instead of GROUP BY; the ", Type).value('.', 'NVARCHAR(MAX)')"
-- part makes FOR XML PATH return plain text, so XML-special characters (&, <, >) survive
SELECT DISTINCT ColA,
       STUFF((SELECT ', ' + ColB FROM T1 AS B WHERE A.ColA = B.ColA FOR XML PATH(''), Type).value('.', 'NVARCHAR(MAX)'), 1, 2, '') AS ColB,
       STUFF((SELECT ', ' + ColC FROM T1 AS B WHERE A.ColA = B.ColA FOR XML PATH(''), Type).value('.', 'NVARCHAR(MAX)'), 1, 2, '') AS ColC
FROM T1 A
I am using Impala SQL. I currently have a database with 3 columns: Account, Date, Type.
Under Type there are various data strings describing the associated type, but some are equal to 'UNKNOWN' and some are null.
I'd like to create another column, Fixed_Type. The values in Fixed_Type should come from the Type column.
If the value in Type is either null or 'UNKNOWN', it should get the last valid value in the Type column, partitioned by account and ordered by date.
If the partition begins with null or 'UNKNOWN', then the value in Fixed_Type should be the first valid value from Type.
For example:
Account | Date | Type      | Fixed_Type
--------+------+-----------+-----------
1       | Jan  | data1     | data1
1       | Feb  | 'UNKNOWN' | data1
1       | Mar  | null      | data1
2       | Apr  | data2     | data2
2       | May  | null      | data2
2       | Jun  | null      | data2
2       | Jul  | data3     | data3
3       | Feb  | 'UNKNOWN' | data4
3       | Mar  | 'UNKNOWN' | data4
3       | Apr  | data4     | data4
I started doing this in Oracle but then realized there is no functionality analogous to IGNORE NULLS implemented in Impala.
This is what I was thinking of doing in Oracle (I realize this only handles forward fill on nulls):
select account, date, type,
       case when type is null
            then last_value(type ignore nulls)
                   over (partition by account order by date)
            else type
       end as fixed_type
from table_name
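(A sketch of mine, not part of the question: in an engine that does support IGNORE NULLS, both the 'UNKNOWN' values and a leading gap can be handled by mapping 'UNKNOWN' to NULL and backfilling with FIRST_VALUE over the following rows. Using dt for the date column, as the answers below do, since DATE is a reserved word in Oracle:)
select account, dt, type,
       coalesce(
         -- forward fill: last valid value up to the current row
         last_value(case when type <> 'UNKNOWN' then type end ignore nulls)
           over (partition by account order by dt),
         -- backfill for a leading gap: first valid value from the current row on
         first_value(case when type <> 'UNKNOWN' then type end ignore nulls)
           over (partition by account order by dt
                 rows between current row and unbounded following)
       ) as fixed_type
from table_name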
I used PostgreSQL to test the query, so I'm not 100% sure you can make it work on your system. The WITH clauses can be replaced with subqueries. I also had to change your Date to a number so the ORDER BY works as intended.
enumerateWords: create an enumerated list of the valid words.
createFlag: set a flag so you can detect when the next group starts.
createGrp: using the flags and SUM(), create the groups.
Finally, join the groups with the enumerated list to assign the Fixed_Type.
The special condition in the JOIN, c.grp = 0 AND e.rn = 1, handles the case where the partition starts with NULL or 'UNKNOWN'.
SQL Fiddle Demo
-- note: in the test data the quote characters are stored as part of the value,
-- hence the comparison with '''UNKNOWN''' (i.e. the literal text 'UNKNOWN')
WITH enumerateWords as (
    SELECT "Account", "Date", "Type",
           row_number() over (partition by "Account"
                              order by "Date") rn
    FROM Days
    WHERE "Type" <> '''UNKNOWN''' AND "Type" IS NOT NULL
), createFlag as (
    SELECT *, CASE WHEN "Type" = '''UNKNOWN''' OR "Type" IS NULL
                   THEN 0
                   ELSE 1
              END as flag
    FROM Days
), createGrp as (
    SELECT *,
           SUM(flag) OVER (PARTITION BY "Account"
                           ORDER BY "Date") as grp
    FROM createFlag
)
SELECT c.*, e."Account", e."Date", e."Type" as "Fixed_Type"
FROM createGrp c
JOIN enumerateWords e
  ON c."Account" = e."Account"
 AND ( c.grp = e.rn
    OR (c.grp = 0 and e.rn = 1)
     )
OUTPUT
As you can see, createGrp displays Fixed_Type from the value stored in the DB, while enumerateWords derives it from Type.
You can also see how flag and grp work together to detect the changes.
| createGrp || enumerateWords |
|---------|------|-----------|------------|------|-----||---------|----|------------|
| Account | Date | Type | Fixed_Type | flag | grp || Account | rn | Fixed_Type |
|---------|------|-----------|------------|------|-----||---------|----|------------|
| 1 | 1 | data1 | data1 | 1 | 1 || 1 | 1 | data1 |
| 1 | 2 | 'UNKNOWN' | data1 | 0 | 1 || 1 | 1 | data1 |
| 1 | 3 | (null) | data1 | 0 | 1 || 1 | 1 | data1 |
|---------|------|-----------|------------|------|-----||---------|----|------------|
| 2 | 4 | data2 | data2 | 1 | 1 || 2 | 1 | data2 |
| 2 | 5 | (null) | data2 | 0 | 1 || 2 | 1 | data2 |
| 2 | 6 | (null) | data2 | 0 | 1 || 2 | 1 | data2 |
| 2 | 7 | data3 | data3 | 1 | 2 || 2 | 2 | data3 |
| 2 | 8 | (null) | data3 | 0 | 2 || 2 | 2 | data3 |
|---------|------|-----------|------------|------|-----||---------|----|------------|
| 3 | 9 | 'UNKNOWN' | data4 | 0 | 0 || 3 | 1 | data4 | <=
| 3 | 10 | 'UNKNOWN' | data4 | 0 | 0 || 3 | 1 | data4 | <=
| 3 | 11 | data4 | data4 | 1 | 1 || 3 | 1 | data4 |
^^^ special case: grp = 0 joins to rn = 1
Oracle Setup:
CREATE TABLE Table_Name ( Acct, Dt, Type ) AS
SELECT 1, DATE '2016-01-01', 'Data1' FROM DUAL UNION ALL
SELECT 1, DATE '2016-02-01', 'UNKNOWN' FROM DUAL UNION ALL
SELECT 1, DATE '2016-03-01', NULL FROM DUAL UNION ALL
SELECT 2, DATE '2016-04-01', 'Data2' FROM DUAL UNION ALL
SELECT 2, DATE '2016-05-01', NULL FROM DUAL UNION ALL
SELECT 2, DATE '2016-06-01', NULL FROM DUAL UNION ALL
SELECT 2, DATE '2016-07-01', 'Data3' FROM DUAL UNION ALL
SELECT 3, DATE '2016-02-01', 'UNKNOWN' FROM DUAL UNION ALL
SELECT 3, DATE '2016-03-01', 'UNKNOWN' FROM DUAL UNION ALL
SELECT 3, DATE '2016-04-01', 'Data4' FROM DUAL;
Query:
SELECT Acct,
Dt,
Type,
Fixed_Type
FROM (
SELECT r.Acct,
r.Dt,
r.Type,
t.type AS fixed_type,
ROW_NUMBER() OVER ( PARTITION BY r.Acct, r.dt
ORDER BY SIGN( ABS( t.dt - r.dt ) ),
SIGN( t.dt - r.dt ),
ABS( t.dt - r.dt ) ) AS rn
FROM table_name r
LEFT OUTER JOIN
table_name t
ON ( r.acct = t.acct
AND t.type IS NOT NULL
AND t.type <> 'UNKNOWN' )
)
WHERE rn = 1
ORDER BY acct, dt;
Explanation:
If you join the table to itself on matching account numbers, then you can compare each row of an account with all the other rows in that same account. However, we aren't interested in comparing with all the rows, just with the rows whose type isn't NULL or 'UNKNOWN', so we get the join condition:
ON ( r.acct = t.acct
AND t.type IS NOT NULL
AND t.type <> 'UNKNOWN' )
A LEFT OUTER JOIN is used just in case there is an account number whose type values are all NULL or 'UNKNOWN', so that its rows aren't excluded.
Then it is a matter of finding the closest matching row. In Oracle, if you subtract one date from another, you get the difference in days (or fractions of days) - so:
SIGN( ABS( t.dt - r.dt ) ) gives 0 if the two dates are identical, or 1 if they are different. Ordering by this first means that a value with the same date is preferred over non-identical dates;
SIGN( t.dt - r.dt ) returns 0 if the two dates are identical (already handled by the previous key), -1 if the compared date is before the current row, or +1 if it is after - this is used to prefer a before date to an after date;
ABS( t.dt - r.dt ) orders the dates by closest together.
So the ORDER BY clause effectively states: identical dates first, then dates before (closest to r.dt first) and finally dates after (closest to r.dt first). For example, if r.dt is 2016-05-01, a candidate t.dt of 2016-04-01 sorts on the keys (1, -1, 30) while 2016-07-01 sorts on (1, +1, 61), so the earlier date wins.
Then that is all placed in an in-line view and filtered to get the best match for each row (WHERE rn = 1).
Output:
ACCT DT TYPE FIXED_TYPE
---------- ------------------- ------- ----------
1 2016-01-01 00:00:00 Data1 Data1
1 2016-02-01 00:00:00 UNKNOWN Data1
1 2016-03-01 00:00:00 Data1
2 2016-04-01 00:00:00 Data2 Data2
2 2016-05-01 00:00:00 Data2
2 2016-06-01 00:00:00 Data2
2 2016-07-01 00:00:00 Data3 Data3
3 2016-02-01 00:00:00 UNKNOWN Data4
3 2016-03-01 00:00:00 UNKNOWN Data4
3 2016-04-01 00:00:00 Data4 Data4
Here is a solution, similar to Juan Carlos's, using the analytic function count and a case expression to create the groups in one pass.
I created more input data to test, for example, what happens when an account only has null and/or 'UNKNOWN' as type (making sure the left outer join works as intended).
create table table_name ( acct, dt, type ) as
select 1, date '2016-01-01', 'Data1' from dual union all
select 1, date '2016-02-01', 'UNKNOWN' from dual union all
select 1, date '2016-03-01', null from dual union all
select 2, date '2016-04-01', 'Data2' from dual union all
select 2, date '2016-05-01', null from dual union all
select 2, date '2016-06-01', null from dual union all
select 2, date '2016-07-01', 'Data3' from dual union all
select 3, date '2016-02-01', 'UNKNOWN' from dual union all
select 3, date '2016-03-01', 'UNKNOWN' from dual union all
select 3, date '2016-04-01', 'Data4' from dual union all
select 3, date '2016-05-01', 'UNKNOWN' from dual union all
select 3, date '2016-06-01', 'Data5' from dual union all
select 4, date '2016-02-01', null from dual union all
select 4, date '2016-03-01', 'UNKNOWN' from dual;
SQL> select * from table_name;
ACCT DT TYPE
---------- ---------- -------
1 2016-01-01 Data1
1 2016-02-01 UNKNOWN
1 2016-03-01
2 2016-04-01 Data2
2 2016-05-01
2 2016-06-01
2 2016-07-01 Data3
3 2016-02-01 UNKNOWN
3 2016-03-01 UNKNOWN
3 2016-04-01 Data4
3 2016-05-01 UNKNOWN
3 2016-06-01 Data5
4 2016-02-01
4 2016-03-01 UNKNOWN
14 rows selected.
Query:
with
prep(acct, dt, type, gp) as (
select acct, dt, type,
count(case when type != 'UNKNOWN' then 1 end)
over (partition by acct order by dt)
from table_name
),
no_nulls(acct, type, gp) as (
select acct, type, gp
from prep
where type != 'UNKNOWN'
)
select p.acct, p.dt, p.type, n.type as fixed_type
from prep p left outer join no_nulls n
on p.acct = n.acct and (p.gp = n.gp or p.gp = 0 and n.gp = 1)
order by acct, dt;
Output:
ACCT DT TYPE FIXED_TYPE
---------- ---------- ------- ----------
1 2016-01-01 Data1 Data1
1 2016-02-01 UNKNOWN Data1
1 2016-03-01 Data1
2 2016-04-01 Data2 Data2
2 2016-05-01 Data2
2 2016-06-01 Data2
2 2016-07-01 Data3 Data3
3 2016-02-01 UNKNOWN Data4
3 2016-03-01 UNKNOWN Data4
3 2016-04-01 Data4 Data4
3 2016-05-01 UNKNOWN Data4
3 2016-06-01 Data5 Data5
4 2016-02-01
4 2016-03-01 UNKNOWN
14 rows selected.
I have two tables:
a (column1, column2, column3)
b (column6, column7, column8)
b.column6 is a foreign key referencing a.column1.
One row from table a sometimes matches 3 rows in table b, sometimes 5, sometimes 1... there is no definite count of returned rows.
I have a business requirement to flip all the corresponding rows from table b into one row, like this:
a.column1, a.column2, a.column3, b.column7, b.column8, b.column7, b.column8
a.column1, a.column2, a.column3, b.column7, b.column8
a.column1, a.column2, a.column3, b.column7, b.column8, b.column7, b.column8, b.column7, b.column8
a.column1, a.column2, a.column3, b.column7, b.column8, b.column7, b.column8, b.column7, b.column8, b.column7, b.column8, b.column7, b.column8
You see, the number of columns coming from table a is always 3... but from table b you might have a variable number of columns... and column7 and column8 have to appear repeatedly in that order.
How can I do this? Thanks.
It sounds like you are going to need to unpivot and then pivot the data. If you have an unknown number of values you will have to use dynamic SQL, but I would suggest writing a hard-coded or static version of the query first, then converting it to dynamic SQL.
The process to unpivot the data is going to take your multiple columns in tableB and convert it into multiple rows. Since you are using SQL Server 2012 you can use CROSS APPLY to unpivot the data:
select column1, column2, column3,
col = col + '_' + cast(seq as varchar(10)),
value
from
(
select a.column1, a.column2, a.column3,
b.column6, b.column7, b.column8,
row_number() over(partition by a.column1
order by a.column1) seq
from tablea a
inner join tableb b
on a.column1 = b.column6
) d
cross apply
(
select 'column6', column6 union all
select 'column7', column7 union all
select 'column8', column8
) c (col, value);
See SQL Fiddle with Demo. This will give you a result similar to:
| COLUMN1 | COLUMN2 | COLUMN3 | COL       | VALUE |
| 1       | 2       | 3       | column6_1 | 1     |
| 1       | 2       | 3       | column7_1 | 18    |
| 1       | 2       | 3       | column8_1 | 56    |
| 1       | 2       | 3       | column6_2 | 1     |
| 1       | 2       | 3       | column7_2 | 25    |
| 1       | 2       | 3       | column8_2 | 89    |
As you can see you now have multiple rows that you can easily apply the pivot function to. The PIVOT code will be:
select column1, column2, column3,
column6_1, column7_1, column8_1,
column6_2, column7_2, column8_2,
column6_3, column7_3, column8_3
from
(
select column1, column2, column3,
col = col + '_' + cast(seq as varchar(10)),
value
from
(
select a.column1, a.column2, a.column3,
b.column6, b.column7, b.column8,
row_number() over(partition by a.column1
order by a.column1) seq
from tablea a
inner join tableb b
on a.column1 = b.column6
) d
cross apply
(
select 'column6', column6 union all
select 'column7', column7 union all
select 'column8', column8
) c (col, value)
) src
pivot
(
max(value)
for col in (column6_1, column7_1, column8_1,
column6_2, column7_2, column8_2,
column6_3, column7_3, column8_3)
) piv;
See SQL Fiddle with Demo. Since you stated that you might have an unknown or dynamic number of entries in tableB, you will need to use dynamic SQL. This will generate a SQL string that will be executed to get you the final result:
DECLARE @cols AS NVARCHAR(MAX),
        @query AS NVARCHAR(MAX)

select @cols = STUFF((SELECT ',' + QUOTENAME(col +'_'+cast(seq as varchar(10)))
from
(
select row_number() over(partition by column6
order by column6) seq
from tableB
) t
cross apply
(
select 'column6', 1 union all
select 'column7', 2 union all
select 'column8', 3
) c (col, so)
group by col, so, seq
order by seq, so
FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
,1,1,'')
set @query = 'SELECT column1, column2, column3,' + @cols + '
from
(
select column1, column2, column3,
col = col + ''_'' + cast(seq as varchar(10)),
value
from
(
select a.column1, a.column2, a.column3,
b.column6, b.column7, b.column8,
row_number() over(partition by a.column1
order by a.column1) seq
from tablea a
inner join tableb b
on a.column1 = b.column6
) d
cross apply
(
select ''column6'', column6 union all
select ''column7'', column7 union all
select ''column8'', column8
) c (col, value)
) x
pivot
(
max(value)
for col in (' + @cols + ')
) p '
execute sp_executesql @query;
See SQL Fiddle with Demo. Both versions give a result:
| COLUMN1 | COLUMN2 | COLUMN3 | COLUMN6_1 | COLUMN7_1 | COLUMN8_1 | COLUMN6_2 | COLUMN7_2 | COLUMN8_2 | COLUMN6_3 | COLUMN7_3 | COLUMN8_3 |
| 1       | 2       | 3       | 1         | 18        | 56        | 1         | 25        | 89        | (null)    | (null)    | (null)    |
| 2       | 4       | 6       | 2         | 78        | 245       | (null)    | (null)    | (null)    | (null)    | (null)    | (null)    |
| 3       | 8       | 9       | 3         | 10        | 15        | 3         | 45        | 457       | 3         | 89        | 50        |