I'd like to create a dynamic select that returns every distinct value for each column in a wide table. I.e.
select distinct #mycolumn
from #mytable
for every column and the results combined to a single table.
Edit1:
Example:
Edit2: The order of the returned data won't matter, and the source table can have all sorts of data types.
Any advice appreciated, thank you!
The only way I can think of is very cumbersome and probably extremely slow:
Using a Tally table (I've generated one using a recursive cte for the sake of this answer, but that's also not a very good way to do that...) and multiple derived tables left joined to that tally table I was able to come up with something that will generate the desired output.
However, as I wrote at the top, it's very cumbersome and probably extremely slow (I've only tested it on a table with 5 columns and 6 rows, so I have no idea about execution speed).
-- Returns the distinct values of Column1..Column5 of YourTable side by side:
-- output row n holds the n-th distinct value (ascending) of each column, or
-- NULL once that column has run out of distinct values.
DECLARE @Count int;

-- No column can have more distinct values than the table has rows, so the
-- row count is a safe upper bound for the tally.
SELECT @Count = COUNT(1)
FROM YourTable;

-- Recursive tally 1..@Count. Fine for a demo; a persisted numbers table is
-- the faster choice on large tables.
WITH tally AS (
    SELECT 1 AS n
    UNION ALL
    SELECT n + 1
    FROM tally
    WHERE n < @Count
)
SELECT
    d1.Column1,
    d2.Column2,
    d3.Column3,
    d4.Column4,
    d5.Column5
FROM tally
LEFT JOIN (
    SELECT Column1, ROW_NUMBER() OVER (ORDER BY Column1) AS rn
    FROM (SELECT DISTINCT Column1 FROM YourTable) AS v1
) AS d1
    ON tally.n = d1.rn
LEFT JOIN (
    SELECT Column2, ROW_NUMBER() OVER (ORDER BY Column2) AS rn
    FROM (SELECT DISTINCT Column2 FROM YourTable) AS v2
) AS d2
    ON tally.n = d2.rn
LEFT JOIN (
    SELECT Column3, ROW_NUMBER() OVER (ORDER BY Column3) AS rn
    FROM (SELECT DISTINCT Column3 FROM YourTable) AS v3
) AS d3
    ON tally.n = d3.rn
LEFT JOIN (
    SELECT Column4, ROW_NUMBER() OVER (ORDER BY Column4) AS rn
    FROM (SELECT DISTINCT Column4 FROM YourTable) AS v4
) AS d4
    ON tally.n = d4.rn
LEFT JOIN (
    SELECT Column5, ROW_NUMBER() OVER (ORDER BY Column5) AS rn
    FROM (SELECT DISTINCT Column5 FROM YourTable) AS v5
) AS d5
    ON tally.n = d5.rn
-- The default recursion limit is 100 levels, so without this hint the query
-- aborts as soon as YourTable holds more than 100 rows.
OPTION (MAXRECURSION 0);
Dynamic version:
-- Dynamic version: builds and runs, for every column of @TableName, the same
-- "tally LEFT JOIN numbered distinct values" query as the static example.
-- @TableName is a bare table name (it is matched against
-- INFORMATION_SCHEMA.COLUMNS.TABLE_NAME and resolved in the default schema).
DECLARE @TableName sysname = 'YourTableName';

DECLARE @SelectList nvarchar(max) = N'';
DECLARE @Joins      nvarchar(max) = N'';
DECLARE @Sql        nvarchar(max);

IF NOT EXISTS (SELECT 1 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = @TableName)
BEGIN
    -- Without this guard an unknown table would produce a malformed statement.
    RAISERROR(N'Table %s was not found or has no columns.', 16, 1, @TableName);
    RETURN;
END;

-- One output column and one LEFT JOIN per source column. Each derived table is
-- aliased c<ordinal> and every output column is qualified with that alias, so
-- source columns that happen to be called "n" or "rn" stay unambiguous, and
-- QUOTENAME keeps names with spaces or reserved words valid.
-- NOTE: variable concatenation over a SELECT is a widely used but formally
-- undocumented pattern; keep it free of ORDER BY.
SELECT
    @SelectList = @SelectList
        + N'c' + CAST(ORDINAL_POSITION AS nvarchar(10)) + N'.' + QUOTENAME(COLUMN_NAME) + N',',
    @Joins = @Joins + N'
LEFT JOIN (
    SELECT ' + QUOTENAME(COLUMN_NAME) + N', ROW_NUMBER() OVER (ORDER BY ' + QUOTENAME(COLUMN_NAME) + N') AS rn
    FROM (SELECT DISTINCT ' + QUOTENAME(COLUMN_NAME) + N' FROM ' + QUOTENAME(@TableName) + N') AS v
) AS c' + CAST(ORDINAL_POSITION AS nvarchar(10))
        + N' ON tally.n = c' + CAST(ORDINAL_POSITION AS nvarchar(10)) + N'.rn'
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @TableName;

SET @Sql = N'
DECLARE @Count int;
SELECT @Count = COUNT(1)
FROM ' + QUOTENAME(@TableName) + N';
WITH tally AS (
    SELECT 1 AS n
    UNION ALL
    SELECT n + 1
    FROM tally
    WHERE n < @Count
)
SELECT ' + LEFT(@SelectList, LEN(@SelectList) - 1) + N'
FROM tally' + @Joins + N'
OPTION (MAXRECURSION 0)';   -- default cap of 100 levels fails on tables with > 100 rows

EXEC (@Sql);
Update
Tested on a table with 22 columns and 47,000 rows, my suggestion took 46 seconds when using a proper tally table on SQL Server 2014.
I was surprised - I thought it would take at least 2-3 minutes.
Here's a dynamic set I was working on. I'm running out of time so it's not cleaned up, and it determines the dynamic row numbers by the max number of rows in the table as a whole, meaning that if you have any duplicates in any column at all, you'll be left with rows where every single column is null.
But other than that, this should work perfectly fine, and the script contains the necessary info showing you how to concatenate a final "WHERE S1.COLNAME IS NOT NULL OR S2.COLNAME IS NOT NULL OR .." filter to the result query, to eliminate those full-null rows (use OR, not AND: a row should only be dropped when every column is null).
Other than that, here's the script. It's gonna be heavy, obviously, so I included a (nolock) hint in it, and a "WHERE ColName is not null" to remove useless results.
Try this on a smaller table and see it work.
/*
Set your table and schema in the @MYTABLE and @MYSCHEMA variables.

The script builds and runs one dynamic query that lists, per column, the
distinct non-NULL values of that column. A row-numbered copy of the table (S)
is the spine; each column contributes a derived table S<column_id> holding its
distinct values ranked with DENSE_RANK, FULL JOINed to the spine on the rank.
Rows beyond a column's number of distinct values are NULL in that column.
*/
SET NOCOUNT ON;

DECLARE @MYTABLE  SYSNAME = 'Mytablename here'
      , @MYSCHEMA SYSNAME = 'dbo';

DECLARE @SQL     NVARCHAR(MAX) = N''
      , @COLNAME SYSNAME
      , @MYCOLS  NVARCHAR(MAX) = N'';

-- Resolve the table once instead of repeating the lookup in every query.
DECLARE @OBJECT_ID INT =
    (SELECT t.object_id
     FROM sys.tables AS t
     WHERE t.name = @MYTABLE
       AND SCHEMA_NAME(t.schema_id) = @MYSCHEMA);

IF @OBJECT_ID IS NULL
BEGIN
    RAISERROR(N'Table %s.%s was not found.', 16, 1, @MYSCHEMA, @MYTABLE);
    RETURN;
END;

-- Quoted two-part name, safe for names with spaces or reserved words.
DECLARE @SOURCE NVARCHAR(300) = QUOTENAME(@MYSCHEMA) + N'.' + QUOTENAME(@MYTABLE);

-- column_id values are not guaranteed to be contiguous (dropped columns leave
-- gaps), so iterate up to MAX(column_id) and skip ids that do not exist,
-- rather than assuming ids 1..COUNT(*).
DECLARE @COL_NOW INT = 1
      , @COL_MAX INT = (SELECT MAX(column_id) FROM sys.columns WHERE object_id = @OBJECT_ID);

-- The spine only needs row numbers; order by the first existing column.
SELECT TOP (1) @COLNAME = name
FROM sys.columns
WHERE object_id = @OBJECT_ID
ORDER BY column_id;

SET @SQL = N'FROM
    (SELECT ROW_NUMBER() OVER (ORDER BY ' + QUOTENAME(@COLNAME) + N' ASC) AS RN
     FROM ' + @SOURCE + N' WITH (NOLOCK)) AS S';

WHILE @COL_NOW <= @COL_MAX
BEGIN
    SET @COLNAME = NULL;

    SELECT @COLNAME = name
    FROM sys.columns
    WHERE object_id = @OBJECT_ID
      AND column_id = @COL_NOW;

    IF @COLNAME IS NOT NULL
    BEGIN
        SET @SQL = @SQL + N'
FULL JOIN
    (SELECT DISTINCT DENSE_RANK() OVER (ORDER BY ' + QUOTENAME(@COLNAME) + N' ASC) AS RN, ' + QUOTENAME(@COLNAME) + N'
     FROM ' + @SOURCE + N' WITH (NOLOCK)
     WHERE ' + QUOTENAME(@COLNAME) + N' IS NOT NULL) AS S' + CAST(@COL_NOW AS NVARCHAR(25)) + N'
    ON S' + CAST(@COL_NOW AS NVARCHAR(25)) + N'.RN = S.RN';

        -- Leading space for the first column, comma separator afterwards.
        SET @MYCOLS = @MYCOLS
            + CASE WHEN LEN(@MYCOLS) = 0 THEN N' ' ELSE N', ' END
            + N'S' + CAST(@COL_NOW AS NVARCHAR(25)) + N'.' + QUOTENAME(@COLNAME);
    END;

    SET @COL_NOW += 1;
END;

-- Order by the spine's row number: it is never NULL, so the all-NULL padding
-- rows sort last (ordering by S1.RN put them first and required column_id 1
-- to exist).
SET @SQL = N'SELECT' + @MYCOLS + N'
' + @SQL + N'
ORDER BY S.RN ASC';

--PRINT(@SQL); -- To check the resulting dynamic SQL without executing it (warning: PRINT only shows the first 8k characters)
EXEC sp_executesql @SQL;
GO
Related
I have four tables like table 1, table 2, table 3 and table 4 and columns are
t1: 1,a, e
t2: 1,b, f
t3: 1,c, g
t4: 1,d, h
These tables are temporary tables.
I need output like this
1,a,b,c,d,e,f,g,h
Dynamically
you can use this below logic-
-- Stack t1..t4 into one eight-column set: each table fills its own pair of
-- columns and pads the other six with NULL. MAX then folds the stack into a
-- single row, since aggregates skip NULLs.
WITH stacked (c1, c2, c3, c4, c5, c6, c7, c8) AS (
    SELECT c1,   c2,   NULL, NULL, NULL, NULL, NULL, NULL FROM t1
    UNION ALL
    SELECT NULL, NULL, c3,   c4,   NULL, NULL, NULL, NULL FROM t2
    UNION ALL
    SELECT NULL, NULL, NULL, NULL, c5,   c6,   NULL, NULL FROM t3
    UNION ALL
    SELECT NULL, NULL, NULL, NULL, NULL, NULL, c7,   c8   FROM t4
)
SELECT
    MAX(c1), MAX(c2), MAX(c3), MAX(c4),
    MAX(c5), MAX(c6), MAX(c7), MAX(c8)
FROM stacked
You can use the INFORMATION_SCHEMA to find the table structure and generate a dynamic query as per below.
FOR DB Tables
-- Demo fixtures: four two-column tables whose names all start with "Table".
CREATE TABLE Table1
(
    Column1 int,
    Column5 int
);
CREATE TABLE Table2
(
    Column2 int,
    Column6 int
);
CREATE TABLE Table3
(
    Column3 int,
    Column7 int
);
CREATE TABLE Table4
(
    Column4 int,
    Column8 int
);

DECLARE @Columns nvarchar(max)
      , @From    nvarchar(max);

-- Comma-separated, column-name-ordered list of [table].[column] for every base
-- table matching the LIKE pattern. The join to INFORMATION_SCHEMA.TABLES keeps
-- views out, and QUOTENAME keeps unusual identifiers valid.
-- Adjust the LIKE pattern to control which tables take part.
SELECT @Columns = STUFF((
    SELECT N',' + QUOTENAME(C.TABLE_NAME) + N'.' + QUOTENAME(C.COLUMN_NAME)
    FROM INFORMATION_SCHEMA.COLUMNS AS C
    INNER JOIN INFORMATION_SCHEMA.TABLES AS T
        ON  T.TABLE_SCHEMA = C.TABLE_SCHEMA
        AND T.TABLE_NAME   = C.TABLE_NAME
    WHERE C.TABLE_NAME LIKE 'Table%'
      AND T.TABLE_TYPE = 'BASE TABLE'
    ORDER BY C.COLUMN_NAME
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 1, N'');

-- "[Table1] CROSS JOIN [Table2] ..." over the same set of tables;
-- STUFF strips the leading 12-character ' CROSS JOIN '.
SELECT @From = STUFF((
    SELECT N' CROSS JOIN ' + QUOTENAME(T.TABLE_NAME)
    FROM INFORMATION_SCHEMA.TABLES AS T
    WHERE T.TABLE_NAME LIKE 'Table%'
      AND T.TABLE_TYPE = 'BASE TABLE'
    ORDER BY T.TABLE_NAME
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 12, N'');

DECLARE @FullQuery nvarchar(max) = N'SELECT ' + @Columns + N' FROM ' + @From;

EXEC (@FullQuery);

-- Clean up the demo fixtures.
DROP TABLE Table1;
DROP TABLE Table2;
DROP TABLE Table3;
DROP TABLE Table4;
For Temp Tables
-- Demo fixtures: four two-column local temp tables named #Table1..#Table4.
CREATE TABLE #Table1
(
    Column1 int,
    Column5 int
);
CREATE TABLE #Table2
(
    Column2 int,
    Column6 int
);
CREATE TABLE #Table3
(
    Column3 int,
    Column7 int
);
CREATE TABLE #Table4
(
    Column4 int,
    Column8 int
);

DECLARE @Columns nvarchar(max)
      , @From    nvarchar(max);

-- In tempdb.sys.tables a local temp table is stored under its name followed by
-- underscore padding and a suffix, so the usable name is the part before the
-- first underscore (this assumes the temp table name itself has no underscore).
-- The OBJECT_ID check keeps only the current session's tables; without it,
-- same-named temp tables of other sessions would be listed too.
SELECT @Columns = STUFF((
    SELECT N',' + QUOTENAME(N.short_name) + N'.' + QUOTENAME(C.name)
    FROM tempdb.sys.tables AS T
    CROSS APPLY (
        SELECT LEFT(T.name, NULLIF(CHARINDEX('_', T.name), 0) - 1) AS short_name
    ) AS N
    INNER JOIN tempdb.sys.columns AS C
        ON C.object_id = T.object_id
    WHERE T.name LIKE '#Table%'
      AND T.object_id = OBJECT_ID(N'tempdb..' + N.short_name)
    ORDER BY C.name
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 1, N'');

-- "[#Table1] CROSS JOIN [#Table2] ..."; STUFF strips the leading
-- 12-character ' CROSS JOIN '.
SELECT @From = STUFF((
    SELECT N' CROSS JOIN ' + QUOTENAME(N.short_name)
    FROM tempdb.sys.tables AS T
    CROSS APPLY (
        SELECT LEFT(T.name, NULLIF(CHARINDEX('_', T.name), 0) - 1) AS short_name
    ) AS N
    WHERE T.name LIKE '#Table%'
      AND T.object_id = OBJECT_ID(N'tempdb..' + N.short_name)
    ORDER BY T.name
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 12, N'');

DECLARE @FullQuery nvarchar(max) = N'SELECT DISTINCT ' + @Columns + N' FROM ' + @From;

EXEC (@FullQuery);

-- Clean up the demo fixtures.
DROP TABLE #Table1;
DROP TABLE #Table2;
DROP TABLE #Table3;
DROP TABLE #Table4;
You need to control the table and columns based on your need.
From what I understand, your data is something like:
Your final output is:
One solution I propose is the following:
Get the data from all tables (table1,table2,table3,table4) in respective temp tables.
For eg. the corresponding temp table for table1 is:
Create a table to consolidate all the data from all the 4 temp tables created in the previous step.
-- Holding table for the stacked contents of the four temp tables:
-- one value plus the name of the column it belongs to per row.
CREATE TABLE consolidated_data
(
    dataVal VARCHAR(1000),
    colName VARCHAR(1000)
)

-- Append every row of #temp1..#temp4 (created in the previous step).
-- NOTE(review): SELECT * presumes each temp table has exactly two columns in
-- (dataVal, colName) order; confirm against the temp table definitions.
INSERT INTO consolidated_data
    SELECT * FROM #temp1
    UNION ALL
    SELECT * FROM #temp2
    UNION ALL
    SELECT * FROM #temp3
    UNION ALL
    SELECT * FROM #temp4
Write dynamic pivot sql query. This is a good reference.
BEGIN
    -- Pivots consolidated_data so that every header in dbo.ColHeader becomes
    -- an output column and the n-th value of each header lands on row n.
    DECLARE @query nvarchar(max);
    DECLARE @cols  nvarchar(max);

    -- Bracket-quoted, comma-separated header list in colHeaderId order;
    -- STUFF drops the leading comma.
    WITH cte AS (
        SELECT DISTINCT colHeaderId, colHeaderName
        FROM [dbo].ColHeader
    )
    SELECT @cols = STUFF((SELECT ',' + QUOTENAME(colHeaderName)
                          FROM cte
                          ORDER BY colHeaderId
                          FOR XML PATH(''), TYPE
                         ).value('.', 'NVARCHAR(MAX)')
                         , 1, 1, '');

    -- rownum numbers the values within each column; it is not aggregated, so
    -- PIVOT groups by it and emits one output row per rownum. Values are
    -- numbered in dataVal order: ordering by the partition key itself (colName)
    -- would leave the numbering arbitrary from run to run.
    SELECT @query =
        N'select * from (
              select d.dataVal as dataVal
                   , row_number() over (partition by d.colName order by d.dataVal) as rownum
                   , c.colHeaderName as colHeaderName
              from consolidated_data d
              left outer join ColHeader c
                  on d.colName = c.colHeaderName
          ) as t
          PIVOT
          (
              MAX(dataVal)
              FOR colHeaderName IN ( ' + @cols + N' )
          ) AS p ; ';

    EXECUTE (@query);
END
I have a database with 7,000 tables. Most of them have a column DataAreaId, but not all, since some tables are global.
I would like to list all tables that have the column DataAreaId and their row count where the column DataAreaId contains "FR".
So for one table it would be:
-- Number of rows in one table whose DataAreaId equals 'FR'.
-- "Table" stands in for the actual table name.
SELECT COUNT(*)
FROM Table
WHERE DataAreaId = 'FR'
Any suggestions?
You can use the following
-- Demo fixtures: two tables that both carry a Dataareaid column.
CREATE TABLE T1 (
    Dataareaid VARCHAR(45)
);
CREATE TABLE T2 (
    Dataareaid VARCHAR(45)
);
INSERT INTO T1 (Dataareaid) VALUES
    ('FR'),
    ('ALG'),
    ('FR');

-- Build "SELECT '<table>' AS TableName, Cnt = (...) UNION ALL ..." with one
-- branch per user table that has a Dataareaid column.
--  * QUOTENAME(t.name, '''') yields a correctly escaped string literal.
--  * The counted table is schema-qualified, so tables outside the default
--    schema resolve as well.
--  * TYPE + .value() returns the text unescaped; plain FOR XML PATH would turn
--    &, < and > inside names into XML entities.
--  * STUFF removes the leading 10-character 'UNION ALL '.
DECLARE @SQL NVARCHAR(MAX);

SELECT @SQL = STUFF((
    SELECT CONCAT(
               N'UNION ALL ',
               N'SELECT ',
               QUOTENAME(t.name, N''''),
               N' AS TableName, ',
               N'Cnt = (SELECT COUNT(1)',
               N' FROM ',
               QUOTENAME(SCHEMA_NAME(t.schema_id)), N'.', QUOTENAME(t.name),
               N' WHERE [Dataareaid] = ''FR'') '
           )
    FROM sys.columns AS c
    INNER JOIN sys.tables AS t
        ON c.object_id = t.object_id
    WHERE c.name = 'Dataareaid'
    ORDER BY t.name
    FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 10, N'');

EXEC sp_executesql @SQL;
Returns:
+-----------+-----+
| TableName | Cnt |
+-----------+-----+
| T1 | 2 |
| T2 | 0 |
+-----------+-----+
Live Demo
One way to do this is to query all tables containing that column, and build a query statement for each
-- Emits one "union all select '<table>', count(1) from <table> where ..."
-- line for each table that has a Dataareaid column.
SELECT
    'union all select '
    + QUOTENAME(tbl.name, N'''')
    + ', count(1) from '
    + tbl.name
    + ' where Dataareaid = ''FR'''
FROM sys.tables AS tbl
INNER JOIN sys.columns AS col
    ON col.object_id = tbl.object_id
WHERE col.name = 'Dataareaid'
each row would look like this
union all select 'SomeTable', count(1) from SomeTable where Dataareaid = 'FR'
Now just put all statements together and remove the first union all
My answer gets the column metadata, executes the statement in a loop and publishes the results to a table variable:
-- Counts, table by table, the rows whose Dataareaid is 'FR' and collects the
-- counts in a table variable that is returned at the end.

-- Work list: every user table with a Dataareaid column, numbered 1..N.
IF OBJECT_ID('tempdb..#LoopList') IS NOT NULL
    DROP TABLE #LoopList;

SELECT
     s.[name] AS SchemaName
    ,t.[name] AS TableName
    ,ROW_NUMBER() OVER (ORDER BY t.[object_id] ASC) AS RowNumber
INTO #LoopList
FROM sys.columns AS c
INNER JOIN sys.tables AS t
    ON c.[object_id] = t.[object_id]
INNER JOIN sys.schemas AS s
    ON t.[schema_id] = s.[schema_id]
WHERE c.[name] = 'Dataareaid';

DECLARE
     @a int = 1                                         -- current work-list row
    ,@b int = (SELECT MAX(RowNumber) FROM #LoopList)    -- last row; NULL when no table qualifies, so the loop is skipped
    ,@c nvarchar(max)                                   -- quoted [schema].[table]
    ,@d nvarchar(max)                                   -- statement to run
    ,@e int;                                            -- count returned by the statement

DECLARE @count TABLE (RowCounter int);
DECLARE @resultsTable TABLE (TableName nvarchar(500), RowCounter int);

WHILE @a <= @b
BEGIN
    -- Intentionally unfiltered: @count is a one-row scratch area that is
    -- emptied before each table is counted.
    DELETE FROM @count;

    SELECT @c = CONCAT(QUOTENAME(SchemaName), N'.', QUOTENAME(TableName))
    FROM #LoopList
    WHERE RowNumber = @a;

    SET @d = CONCAT(N'select count(*) from '
                   ,@c
                   ,N' where Dataareaid = ''FR''');

    -- INSERT ... EXEC captures the single-row result of the dynamic statement.
    INSERT INTO @count (RowCounter)
    EXEC sp_executesql @d;

    SET @e = (SELECT TOP 1 RowCounter FROM @count);

    INSERT INTO @resultsTable (TableName, RowCounter)
    VALUES (@c, @e);

    SET @a += 1;
END;

SELECT TableName, RowCounter
FROM @resultsTable;
I have a table with 700 columns. I am trying to get a list of distinct values for each column and their count. I am using the below query to get the result for 1 column
Select distinct col1, count(*) from MyTable group by 1.
Result:
col1 count(*)
a 10
b 20
c 40
How can I get the result for all columns using a single query in the most optimal way?
The basic query is:
-- One grouped count per column, stacked with UNION ALL.
-- The ". . ." line stands for the omitted statements for col003..col699.
select col001, count(*) from MyTable group by col001 union all
select col002, count(*) from MyTable group by col002 union all
. . .
select col700, count(*) from MyTable group by col700 ;
Not pleasant, but that is basically the query you need to run. SQL doesn't really do multiple independent aggregations more efficiently than doing them separately (even using grouping sets, in my experience).
You can construct the query. One way is to run something like this:
-- Generates one "select <col>, count(*) ... union all" statement per column by
-- substituting the [tab] and [col] markers in a fixed template.
-- "??" is a placeholder for the schema name.
SELECT
    REPLACE(
        REPLACE(
            'select [col], count(*) as cnt from [tab] group by [col] union all ',
            '[tab]',
            table_name
        ),
        '[col]',
        column_name
    )
FROM information_schema.columns
WHERE table_name = 'mytable'
  AND table_schema = ??;
You can then copy the generated SQL (removing the final union all) and run it.
Note: That above is generic; the exact code might differ by database.
A list with distinct values for each column is impossible. What if column A has 5 distinct values and column B has 7. What would your list look like?
The other question is easier, but as @Gordon Linoff states, it takes 2 steps. Elaborating on his answer, for MS SQL:
-- Generates one " count(distinct([column])) as [column]," fragment per column
-- of your_table, in column order. The [col] marker is replaced by the
-- bracket-quoted column name, so columns with spaces or reserved-word names
-- still produce valid SQL (substituting the bare name dropped the brackets).
SELECT REPLACE(' count(distinct([col])) as [col],',
               '[col]', QUOTENAME(column_name)
       )
FROM information_schema.columns
WHERE table_name = 'your_table'
ORDER BY ordinal_position;
Copy the results and paste them into a new query window between SELECT and FROM:
SELECT
[[results query 1]]
FROM your_table
Remember to delete the last ',' from query 1 results.
Replace [table name] with the table you need counts for.
-- Counts the distinct values of every column of one user table and returns
-- (column_id, name, record_count), one row per column.
-- @table is the bare table name as stored in sys.objects.
DECLARE @table sysname = '[table name]';
DECLARE @i INT, @cntOUT int, @SQL nvarchar(max) = N'';
DECLARE @ParmDef nvarchar(500) = N'@cnt int OUTPUT';

-- Allow the script to be re-run in the same session.
IF OBJECT_ID('tempdb..#T1') IS NOT NULL
    DROP TABLE #T1;

SELECT c.column_id, c.name, 0 AS record_count
INTO #T1
FROM sys.all_columns AS c
WHERE c.object_id = (SELECT object_id FROM sys.objects WHERE name = @table AND type = 'U');

-- Walk the column_ids that actually exist: ids can have gaps after columns
-- have been dropped, so stepping by +1 would re-run a stale statement.
SELECT @i = MIN(column_id) FROM #T1;

WHILE @i IS NOT NULL
BEGIN
    -- nvarchar(max) plus QUOTENAME: long or unusual identifiers neither
    -- truncate nor break the statement.
    SELECT @SQL = N'SELECT @cnt = COUNT(DISTINCT ' + QUOTENAME(name) + N') FROM ' + QUOTENAME(@table) + N';'
    FROM #T1
    WHERE column_id = @i;

    -- @stmt and @params are the documented sp_executesql parameter names.
    EXECUTE sp_executesql @stmt = @SQL, @params = @ParmDef, @cnt = @cntOUT OUTPUT;

    UPDATE #T1
    SET record_count = @cntOUT
    WHERE column_id = @i;

    SELECT @i = MIN(column_id) FROM #T1 WHERE column_id > @i;
END;

SELECT column_id, name, record_count
FROM #T1;
--DROP TABLE #T1
I've got this dynamically created mess that essentially takes all fields in a table and compares two records against each other:
-- Compares two rows of tblSQLAdminInventory (identified by TransID) field by
-- field and lists the fields whose values differ.
DECLARE @ID1 AS varchar(3);
DECLARE @ID2 AS varchar(3);
SET @ID1 = '42';
SET @ID2 = '600';

-- Final filter on the pivoted result: keep fields where the two rows differ.
-- Because <> is never true when either side is NULL, fields that are NULL in
-- one of the rows are not reported.
DECLARE @whereClauseParam varchar(max) = '[' + @ID1 + '] <> [' + @ID2 + ']';

--***************************************--
--******** tblSQLAdminInventory ********--
--***************************************--

-- Column lists for the UNPIVOT, excluding the key column TransID:
--   @AIFields  : [col], [col], ...
--   @AIFields2 : Convert(VarChar(250), [col]) AS [col], ...
--                (UNPIVOT needs all source columns to share one type)
-- A system_type_id filter can be added to either query to restrict the
-- comparison to one type: 56 Int, 61 DateTime, 104 Bit, 167 VarChar,
-- 231 NVarChar.
DECLARE @AIFields  varchar(max) = '';
DECLARE @AIFields2 varchar(max) = '';

SELECT @AIFields += QUOTENAME(col.name) + ', '
FROM sys.columns AS col
WHERE col.object_id = OBJECT_ID('tblSQLAdminInventory')
  AND col.name <> 'TransID';

SELECT @AIFields2 += 'Convert(VarChar(250), ' + QUOTENAME(col.name) + ') AS ' + QUOTENAME(col.name) + ', '
FROM sys.columns AS col
WHERE col.object_id = OBJECT_ID('tblSQLAdminInventory')
  AND col.name <> 'TransID';

-- Key lists for the PIVOT:
--   @AIkeyIDs  : [42] AS [KeyID_42],[600] AS [KeyID_600]  (select list)
--   @AIkeyIDs1 : [42],[600]                                (PIVOT ... IN list)
DECLARE @AIkeyIDs  varchar(max),
        @AIkeyIDs1 varchar(max);

SELECT @AIkeyIDs  = COALESCE(@AIkeyIDs + ',', '') + QUOTENAME(t.TransID) + ' AS [KeyID_' + CAST(t.TransID AS varchar(10)) + ']',
       @AIkeyIDs1 = COALESCE(@AIkeyIDs1 + ',', '') + QUOTENAME(t.TransID)
FROM tblSQLAdminInventory AS t
WHERE TransID IN (@ID1, @ID2);

-- Generate the dynamic SQL. Both field lists end in ', '; LEN ignores the
-- trailing space, so SUBSTRING(..., 1, LEN(...) - 1) removes the last comma.
DECLARE @AISQL2 varchar(max) = 'SELECT Value AS FieldName, ';

SELECT @AISQL2 += @AIkeyIDs + '
FROM
(SELECT TransID, Value, FieldName
 FROM
 (SELECT TransID, ' + SUBSTRING(@AIFields2, 1, LEN(@AIFields2) - 1) + '
  FROM tblSQLAdminInventory) p
 UNPIVOT
 (FieldName FOR Value IN
  (' + SUBSTRING(@AIFields, 1, LEN(@AIFields) - 1) + ')
 ) AS unpvt) AS SourceTable
PIVOT
(
 MAX(FieldName)
 FOR TransID IN (' + @AIkeyIDs1 + ')
) AS PivotTable
WHERE ' + @whereClauseParam;

EXECUTE (@AISQL2);
The problem is, it won't seem to let me put the results in a temp table. I tried using this code but it keeps telling me the #Temp1 object doesn't exist:
-- Variant of the statement builder that adds INTO #Temp1 to the generated
-- SELECT. #Temp1 is then created inside the batch started by EXECUTE and is
-- dropped when that batch ends, so it is not visible to the calling batch.
SELECT @AISQL2 += @AIkeyIDs + '
INTO #Temp1
FROM
(SELECT TransID, Value, FieldName
 FROM
 (SELECT TransID, ' + SUBSTRING(@AIFields2, 1, LEN(@AIFields2) - 1) + '
  FROM tblSQLAdminInventory) p
 UNPIVOT
 (FieldName FOR Value IN
  (' + SUBSTRING(@AIFields, 1, LEN(@AIFields) - 1) + ')
 ) AS unpvt) AS SourceTable
PIVOT
(
 MAX(FieldName)
 FOR TransID IN (' + @AIkeyIDs1 + ')
) AS PivotTable
WHERE ' + @whereClauseParam
What am I doing wrong?
You're using dynamic SQL. The EXECUTE statement starts a whole new scope and that temporary table isn't available in that scope.
There are several work-arounds, like using a permanent table that you clear out or using a global temporary table, but they all have their own pitfalls.
I'm trying to write a query that will produce a very small sample of data from each column of a table, in which the sample is made up of the top 3 most common values. This particular problem is part of a bigger task, which is to write scripts that can characterize a database and its tables, its data integrity, and also quickly survey common values in the table on a per-column basis. Think of this as an automated "analysis" of a table.
On a single column basis, I do this already by simply calculating the frequency of values and then sorting by frequency. If I had a column called "color" and all colors were in it, and it just so happened that the color "blue" was in most rows, then the top 1 most frequently occurring value would be "blue". In SQL that is easy to calculate.
However, I'm not sure how I would do this over multiple columns.
Currently, when I do a calculation over all columns of a table, I perform the following type of query:
-- Builds and runs a single SELECT that returns, for every column of the
-- table, the number of distinct values (compared as varchar(max)).
-- "database" and "table" are placeholders for the real names.
USE database;

DECLARE @t nvarchar(max);
SET @t = N'SELECT ';

-- QUOTENAME protects both the column reference and its alias, so names with
-- spaces, reserved words or quote characters stay valid.
SELECT @t = @t + N'count(DISTINCT CAST(' + QUOTENAME(c.name) + N' as varchar(max))) ' + QUOTENAME(c.name) + N','
FROM sys.columns AS c
WHERE c.object_id = OBJECT_ID('table');

-- Drop the trailing comma and close the statement.
SET @t = SUBSTRING(@t, 1, LEN(@t) - 1) + N' FROM table;';

EXEC sp_executesql @t;
However, it's not entirely clear to me how I would do that here.
(Side note: columns of type text, ntext, and image should be excluded, since those would cause errors while counting distinct values, but I'm less concerned about solving that.)
But the problem of getting top three most frequent values per column has got me absolutely stumped.
Ideally, I'd like to end up with something like this:
Col1 Col2 Col3 Col4 Col5
---------------------------------------------------------------------
1,2,3 red,blue,green 29,17,0 c,d,j nevada,california,utah
I hacked this together, but it seems to work:
I can't help but think I should be using RANK().
-- Collects value frequencies for every column of the matching table(s) and
-- returns (1) all frequencies and (2) the three most frequent values per column.
-- <DB> and <table> are placeholders; <table> must be replaced by a quoted LIKE
-- pattern, e.g. N'MyTable'.
USE <DB>;

DECLARE @query  nvarchar(max);
DECLARE @column sysname;
DECLARE @table  sysname;
DECLARE @schema sysname;
DECLARE @i INT = 1;
DECLARE @maxi INT;
DECLARE @target NVARCHAR(MAX) = <table>;

-- Work list of (column, table, schema), numbered by the identity column.
DECLARE @stage TABLE (i int IDENTITY(1,1), col sysname, tbl sysname, sch sysname);
DECLARE @results TABLE (ColumnName nvarchar(max), ColumnValue nvarchar(max), ColumnCount int, TableName nvarchar(max));

-- text, ntext and image columns are skipped: they cannot be grouped.
INSERT INTO @stage (col, tbl, sch)
SELECT c.name, o.name, SCHEMA_NAME(o.schema_id)
FROM sys.columns AS c
INNER JOIN sys.objects AS o
    ON o.object_id = c.object_id
   AND o.type = 'U'
WHERE c.system_type_id IN (SELECT system_type_id FROM sys.types WHERE [name] NOT IN ('text', 'ntext', 'image'))
  AND o.name LIKE @target;

SET @maxi = (SELECT MAX(i) FROM @stage);

WHILE @i <= @maxi
BEGIN
    SELECT @column = col, @table = tbl, @schema = sch
    FROM @stage
    WHERE i = @i;

    -- QUOTENAME(x, '''') builds escaped string literals, QUOTENAME(x) bracketed
    -- identifiers. COUNT(*) is used so that the NULL group reports its real
    -- frequency; COUNT(<column>) would always report 0 for it.
    SET @query = N'SELECT N' + QUOTENAME(@column, '''')
               + N', ' + QUOTENAME(@column)
               + N', COUNT(*)'
               + N', N' + QUOTENAME(@table, '''')
               + N' FROM ' + QUOTENAME(@schema) + N'.' + QUOTENAME(@table)
               + N' GROUP BY ' + QUOTENAME(@column);

    INSERT INTO @results (ColumnName, ColumnValue, ColumnCount, TableName)
    EXEC sp_executesql @query;

    SET @i = @i + 1;
END;

-- All value frequencies.
SELECT ColumnName, ColumnValue, ColumnCount, TableName
FROM @results;

-- Top three per column. TableName is part of the partition because the LIKE
-- pattern may match several tables that share column names.
WITH cte AS (
    SELECT ColumnName, ColumnValue, ColumnCount, TableName,
           ROW_NUMBER() OVER (PARTITION BY TableName, ColumnName ORDER BY ColumnCount DESC) AS rn
    FROM @results
)
SELECT ColumnName, ColumnValue, ColumnCount, TableName, rn
FROM cte
WHERE rn <= 3;
Start with this SQL Statement builder, and modify it to suit your liking:
EDIT Added Order by Desc
-- Statement builder: emits one "Select Top 3 ..." statement per selected
-- column; run the generated statements to get each column's three most
-- frequent values. Edit the two IN lists to choose tables and columns.
WITH ColumnSet AS
(
    SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME IN ('Table1')
      AND COLUMN_NAME IN ('Column1', 'Column2')
)
-- QUOTENAME keeps the generated statements valid for identifiers that contain
-- spaces or reserved words.
SELECT 'Select Top 3 ' + QUOTENAME(COLUMN_NAME)
     + ', Count (*) NumInstances From ' + QUOTENAME(TABLE_SCHEMA) + '.' + QUOTENAME(TABLE_NAME)
     + ' Group By ' + QUOTENAME(COLUMN_NAME)
     + ' Order by Count (*) Desc'
FROM ColumnSet