Selecting columns from a query - sql

I'm using a query to get a collection of column names:
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE [...]
For each column in this collection, I'd like to count every value in the original table that is neither NULL nor empty, grouped by column name.
Let's say I have a table containing
COL1 | COL2 | COL3
------------------
VAL1 | VAL2 | NULL
VAL3 | | VAL4
VAL5 | |
I'm looking for a request to get:
COL1 | 3
COL2 | 1
COL3 | 1
It's for analytics purpose.
Thanks for your help!

Here is a simple process. Run the following query:
-- Emits one "SELECT '<column>', COUNT(<column>) ... union all" statement per column.
-- COUNT(<column>) counts only non-NULL values (empty strings are still counted).
-- Identifiers go through QUOTENAME so names containing spaces, brackets or
-- reserved words still yield valid statements; the schema comes from
-- TABLE_SCHEMA, which is the schema column of INFORMATION_SCHEMA.COLUMNS.
-- Replace the [...] placeholder with the predicate that selects the target table.
SELECT 'SELECT ''' + REPLACE(COLUMN_NAME, '''', '''''') + ''', COUNT(' + QUOTENAME(COLUMN_NAME) + ') as NotNull FROM ' + QUOTENAME(TABLE_SCHEMA) + '.' + QUOTENAME(TABLE_NAME) + ' union all '
FROM INFORMATION_SCHEMA.COLUMNS
WHERE [...]
Copy the results into a query window, remove the final union all, and run the query.

The below code seems to work for your issue
-- Demo table reproducing the data from the question.
CREATE TABLE sample
(
    col1 varchar(10),
    col2 varchar(10),
    col3 varchar(10)
);

INSERT INTO sample (COL1,COL2,COL3) VALUES ('VAL1 ',' VAL2 ',NULL);
INSERT INTO sample (COL1,COL2,COL3) VALUES ('VAL3 ',' ',' VAL4');
INSERT INTO sample (COL1,COL2,COL3) VALUES ('VAL5 ',' ',' ');

DECLARE @cols1 NVARCHAR(MAX);
DECLARE @sql NVARCHAR(MAX);

-- Build one "COUNT(CASE WHEN LEN([col]) != 0 THEN 1 END) AS [col]" item per column.
-- LEN() ignores trailing blanks and returns NULL for NULL, so blank-only and
-- NULL values fall through the CASE and are not counted.
-- TYPE + .value() keeps FOR XML from entity-escaping characters in column names.
SELECT @cols1 = STUFF((
        SELECT ', COUNT(CASE WHEN LEN(' + QUOTENAME(t1.name) + ') != 0 THEN 1 END) AS ' + QUOTENAME(t1.name)
        FROM sys.columns AS t1
        WHERE t1.object_id = OBJECT_ID('sample')
        ORDER BY t1.column_id
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'), 1, 2, '');   -- strip the leading ', '

SET @sql = N'SELECT ' + @cols1 + N' FROM sample';

EXEC (@sql);

Here is my slightly longer take on this:
-- Counts, for each column of @tableName, the values that are neither NULL nor
-- empty, and returns one (colName, valueCount) row per column.
DECLARE @cols TABLE (colID int, colName sysname);
DECLARE @results TABLE (colName sysname, valueCount bigint);

-- table name (may be schema-qualified)
DECLARE @tableName nvarchar(512) = N'INSERT TABLE NAME HERE';

-- Resolve the name once and rebuild it from catalog metadata so the dynamic
-- SQL below only ever embeds QUOTENAME-protected identifiers.
DECLARE @objectId int = OBJECT_ID(@tableName);
DECLARE @qualifiedName nvarchar(520) =
    QUOTENAME(OBJECT_SCHEMA_NAME(@objectId)) + N'.' + QUOTENAME(OBJECT_NAME(@objectId));

-- select column names from table
INSERT INTO @cols (colID, colName)
SELECT column_id, name
FROM sys.columns
WHERE object_id = @objectId;

DECLARE @currentColID int = 0;
DECLARE @currentName sysname = N'';
DECLARE @currentCount bigint = 0;
DECLARE @sql nvarchar(max);   -- where the dynamic sql will be stored

-- go through all columns
WHILE (1 = 1)
BEGIN
    -- step to the next column id; stop when none is left
    SELECT TOP (1) @currentColID = c.colID, @currentName = c.colName
    FROM @cols AS c
    WHERE c.colID > @currentColID
    ORDER BY c.colID;

    IF @@ROWCOUNT = 0 BREAK;

    -- dynamic query to get non-empty, not-null counts
    -- (both conditions must hold: AND, not OR, otherwise '' is counted too)
    SET @sql = N'select @p1 = COUNT(' + QUOTENAME(@currentName) + N') from ' + @qualifiedName
             + N' where ' + QUOTENAME(@currentName) + N' is not null and LEN(' + QUOTENAME(@currentName) + N') > 0';

    EXEC sp_executesql @sql, N'@p1 bigint output', @p1 = @currentCount OUTPUT;

    -- insert result to buffer
    INSERT INTO @results (colName, valueCount) VALUES (@currentName, @currentCount);
END

-- print the buffer
SELECT colName, valueCount FROM @results;
Have fun :)

Related

SQL count distinct or not null for each column for many columns

I need to analyze a large table with hundreds of columns. A lot of columns are unused.
To investigate I could do something like
-- Lists each distinct value of Column1 once.
SELECT Column1
FROM myTable
GROUP BY Column1
or
-- Number of distinct Column1 values (a NULL, if present, counts as one value).
SELECT COUNT(*)
FROM
(
    SELECT DISTINCT Column1
    FROM MyTable
) AS C
Then I do the same for column2 and so on. However these queries only work for one column which is time consuming and does not give overview in one glance.
Any idea how to build such investigation query for all columns in one?
You need only 1 query where you have to list all the columns of the table:
SELECT COUNT(DISTINCT Column1) column1_count,
COUNT(DISTINCT Column2) column2_count,
COUNT(DISTINCT Column3) column3_count
.....................................
FROM MyTable;
For local purposes only, you can make it dynamic like this:
Get the columns of the table
the query is created as the colleagues did and then it is executed with the EXEC()
-- Builds and runs "SELECT COUNT(DISTINCT [col]) [col_count], ... FROM Customer"
-- covering every column of the Customer table.
DECLARE @columns TABLE (RowId int IDENTITY(1,1), ColumnName sysname);
DECLARE @ii int = 1;          -- identity values start at 1
DECLARE @max int = 0;
DECLARE @sqlQuery nvarchar(max);

INSERT INTO @columns (ColumnName)
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = N'Customer'
ORDER BY ORDINAL_POSITION;

SET @sqlQuery = N'SELECT ';
SELECT @max = COUNT(*) FROM @columns;

WHILE @ii <= @max
BEGIN
    -- QUOTENAME protects names with spaces or reserved words, for both the
    -- counted column and the generated "<name>_count" alias.
    SELECT @sqlQuery = CONCAT(@sqlQuery,
                              'COUNT(DISTINCT ', QUOTENAME(ColumnName), ') ',
                              QUOTENAME(LOWER(ColumnName) + '_count'), ', ')
    FROM @columns
    WHERE RowId = @ii;

    SET @ii = @ii + 1;
END

IF @max > 0
BEGIN
    -- LEN ignores the trailing blank, so LEN - 1 drops exactly the final comma.
    SET @sqlQuery = LEFT(@sqlQuery, LEN(@sqlQuery) - 1) + N' FROM Customer';

    SELECT @sqlQuery;   -- show the generated statement
    EXEC (@sqlQuery);
END
You should flesh out your requirement a bit more. If all you want to know is if a column contains only NULLs, you'll want to check for max(ColumnName) is null
-- Lists every column of every user table whose values are all NULL.
-- One probe query is generated per column; a probe returns a row only when
-- MAX(<column>) IS NULL, i.e. the column holds no non-NULL value.
-- NOTE(review): MAX is not defined for some data types (e.g. bit, xml, text);
-- filter such columns out if the database contains them.
DECLARE @sql TABLE (id int IDENTITY(1,1), QueryString nvarchar(max));

-- Wide enough for a bracketed three-part name [schema].[table].[column].
CREATE TABLE ##emptyColumns (emptyColumn nvarchar(800));

DECLARE @i int = 0;
DECLARE @iMax int;
DECLARE @runthis nvarchar(max);

INSERT @sql (QueryString)
SELECT 'select ''' + REPLACE(QUOTENAME(s.name) + '.' + QUOTENAME(t.name) + '.' + QUOTENAME(c.name), '''', '''''') + ''' as [column]
from ' + QUOTENAME(s.name) + '.' + QUOTENAME(t.name) + '
having max(' + QUOTENAME(c.name) + ') is null'
FROM sys.tables AS t
INNER JOIN sys.columns AS c
    ON c.object_id = t.object_id
INNER JOIN sys.schemas AS s
    ON s.schema_id = t.schema_id
ORDER BY s.name
       , t.name
       , c.column_id;

SELECT @iMax = COUNT(*)
FROM @sql;

PRINT @iMax;

WHILE @i < @iMax
BEGIN
    SET @i = @i + 1;

    SELECT @runthis = 'insert into ##emptyColumns
' + QueryString
    FROM @sql
    WHERE id = @i;

    EXECUTE sp_executesql @runthis;
END

SELECT emptyColumn
FROM ##emptyColumns;

DROP TABLE ##emptyColumns;
One further option you might consider:
-- Builds one UNION ALL branch per column of MySchema.MyTable returning the
-- column name, its number of NULLs and its number of distinct non-NULL values.
DECLARE @sql nvarchar(max);

-- ISNULL(@sql + ' union all ', '') yields '' for the first row (NULL + text is
-- NULL) and the separator for every following row.
SELECT @sql = ISNULL(@sql + ' union all ', '') + 'select ''' + REPLACE(COLUMN_NAME, '''', '''''') + ''' as column_name,
sum(case when ' + QUOTENAME(COLUMN_NAME) + ' is null then 1 else 0 end) as null_values,
count(distinct ' + QUOTENAME(COLUMN_NAME) + ') as count_distinct
from ' + QUOTENAME(TABLE_SCHEMA) + '.' + QUOTENAME(TABLE_NAME) + '
'
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = 'MySchema' AND TABLE_NAME = 'MyTable';

EXEC (@sql);
If you had very big tables with large numbers of columns and were only interested in empty columns you could look into something like checksum_agg(checksum(column_name)). It may help improve performance.
You'd need to be wary of column data types, as they are not all compatible with distinct.

Get top three most common values from every column in a table

I'm trying to write a query that will produce a very small sample of data from each column of a table, in which the sample is made up of the top 3 most common values. This particular problem is part of a bigger task, which is to write scripts that can characterize a database and its tables, its data integrity, and also quickly survey common values in the table on a per-column basis. Think of this as an automated "analysis" of a table.
On a single column basis, I do this already by simply calculating the frequency of values and then sorting by frequency. If I had a column called "color" and all colors were in it, and it just so happened that the color "blue" was in most rows, then the top 1 most frequently occurring value would be "blue". In SQL that is easy to calculate.
However, I'm not sure how I would do this over multiple columns.
Currently, when I do a calculation over all columns of a table, I perform the following type of query:
-- Counts the distinct values of every column of one table in a single query.
-- "database" and "table" are placeholders for the real names.
USE database;

DECLARE @t nvarchar(max);
SET @t = N'SELECT ';

-- One "count(DISTINCT CAST([col] as varchar(max))) [col]," item per column.
SELECT @t = @t + N'count(DISTINCT CAST(' + QUOTENAME(c.name) + N' as varchar(max))) ' + QUOTENAME(c.name) + N','
FROM sys.columns c
WHERE c.object_id = object_id('table');

-- Drop the trailing comma, then append the FROM clause.
SET @t = SUBSTRING(@t, 1, LEN(@t) - 1) + N' FROM table;';

EXEC sp_executesql @t;
However, it's not entirely clear to me how I would apply that approach here.
(Side note: columns of type text, ntext, and image have to be excluded, since they cause errors when counting distinct values, but I'm less concerned about solving that.)
But the problem of getting top three most frequent values per column has got me absolutely stumped.
Ideally, I'd like to end up with something like this:
Col1 Col2 Col3 Col4 Col5
---------------------------------------------------------------------
1,2,3 red,blue,green 29,17,0 c,d,j nevada,california,utah
I hacked this together, but it seems to work:
I can't help but think I should be using RANK().
-- For every column of the user tables whose name matches @target, collects the
-- frequency of each value and returns the three most frequent values per column.
-- <DB> and <table> are placeholders to be replaced before running.
USE <DB>;

DECLARE @query nvarchar(max);
DECLARE @column sysname;
DECLARE @schema sysname;
DECLARE @table sysname;
DECLARE @i int = 1;
DECLARE @maxi int;
DECLARE @target nvarchar(max) = <table>;   -- LIKE pattern for the table name

DECLARE @stage TABLE (i int IDENTITY(1,1), col sysname, sch sysname, tbl sysname);
DECLARE @results TABLE (ColumnName nvarchar(max), ColumnValue nvarchar(max), ColumnCount int, TableName nvarchar(max));

-- Stage the columns to analyse; text, ntext and image cannot be grouped.
INSERT INTO @stage (col, sch, tbl)
SELECT c.name, SCHEMA_NAME(o.schema_id), o.name
FROM sys.columns AS c
INNER JOIN sys.objects AS o
    ON o.object_id = c.object_id
WHERE o.type = 'U'
  AND c.system_type_id IN (SELECT system_type_id FROM sys.types WHERE [name] NOT IN ('text', 'ntext', 'image'))
  AND o.name LIKE @target;

SET @maxi = (SELECT MAX(i) FROM @stage);   -- NULL when nothing matched: loop is skipped

WHILE @i <= @maxi
BEGIN
    SELECT @column = col, @schema = sch, @table = tbl
    FROM @stage
    WHERE i = @i;

    -- COUNT(*) rather than COUNT(column): the NULL group must report its real
    -- row count instead of 0. QUOTENAME(x, '''') builds an escaped string literal.
    SET @query = N'SELECT ' + QUOTENAME(@column, '''') + N', ' + QUOTENAME(@column)
               + N', COUNT(*) AS ColumnCount, ' + QUOTENAME(@table, '''') + N' AS tablename'
               + N' FROM ' + QUOTENAME(@schema) + N'.' + QUOTENAME(@table)
               + N' GROUP BY ' + QUOTENAME(@column);

    INSERT INTO @results (ColumnName, ColumnValue, ColumnCount, TableName)
    EXEC sp_executesql @query;

    SET @i = @i + 1;
END

-- Full frequency table first, then the top three values of each column.
SELECT ColumnName, ColumnValue, ColumnCount, TableName
FROM @results;

WITH cte AS
(
    SELECT ColumnName, ColumnValue, ColumnCount, TableName,
           ROW_NUMBER() OVER (PARTITION BY ColumnName ORDER BY ColumnCount DESC) AS rn
    FROM @results
)
SELECT ColumnName, ColumnValue, ColumnCount, TableName, rn
FROM cte
WHERE rn <= 3;
Start with this SQL Statement builder, and modify it to suit your liking:
EDIT Added Order by Desc
-- Produces, for each selected column, the text of a query returning that
-- column's three most frequent values. The output is meant to be copied and run.
WITH TargetColumns AS
(
    SELECT
        c.TABLE_SCHEMA,
        c.TABLE_NAME,
        c.COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS AS c
    WHERE c.TABLE_NAME IN ('Table1')
      AND c.COLUMN_NAME IN ('Column1', 'Column2')
)
SELECT
    'Select Top 3 ' + tc.COLUMN_NAME
    + ', Count (*) NumInstances From ' + tc.TABLE_SCHEMA + '.' + tc.TABLE_NAME
    + ' Group By ' + tc.COLUMN_NAME
    + ' Order by Count (*) Desc'
FROM TargetColumns AS tc

Getting value from different columns and storing in a string with comma separated in Sql server

I have a table with 5 columns. I want to fetch the values from a row and store all 5 values in a variable as a comma-separated string.
table Name: table1
col1 col2 col3 col4 col5
1 2 3 4 5
Result should be
1,2,3,4,5
Note: I don't want to mention the column names. With the CONCAT function, for example, the columns have to be listed explicitly, but in my case I will only have the table name.
Use the table : information_schema.columns
-- Looks up the names of the first five columns of dbo.table1 by ordinal
-- position and returns each row as one comma-separated string.
DECLARE @col1 sysname;
DECLARE @col2 sysname;
DECLARE @col3 sysname;
DECLARE @col4 sysname;
DECLARE @col5 sysname;

-- One pass over the catalog instead of five separate lookups.
SELECT
    @col1 = MAX(CASE WHEN ORDINAL_POSITION = 1 THEN COLUMN_NAME END),
    @col2 = MAX(CASE WHEN ORDINAL_POSITION = 2 THEN COLUMN_NAME END),
    @col3 = MAX(CASE WHEN ORDINAL_POSITION = 3 THEN COLUMN_NAME END),
    @col4 = MAX(CASE WHEN ORDINAL_POSITION = 4 THEN COLUMN_NAME END),
    @col5 = MAX(CASE WHEN ORDINAL_POSITION = 5 THEN COLUMN_NAME END)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = 'dbo'
  AND TABLE_NAME = 'table1';

DECLARE @sqlText nvarchar(max);

-- CONCAT converts every argument to a string, so mixed column types are fine.
SET @sqlText = N'SELECT CONCAT('
             + QUOTENAME(@col1) + N', '','', '
             + QUOTENAME(@col2) + N', '','', '
             + QUOTENAME(@col3) + N', '','', '
             + QUOTENAME(@col4) + N', '','', '
             + QUOTENAME(@col5) + N') FROM dbo.table1';

EXEC (@sqlText);
If you always have exactly 5 columns in your table, this would work.
Note this doesn't refer to the column names
-- The derived table renames the five columns of table1 positionally to a..e,
-- so the outer query never refers to the real column names.
SELECT
    CONCAT(t.a, ',', t.b, ',', t.c, ',', t.d, ',', t.e)
FROM
(
    -- can test with this instead of table1:
    -- SELECT * FROM (VALUES (1,2,3,4,5)) x(col1,col2,col3,col4,col5)
    SELECT * FROM table1
) AS t(a, b, c, d, e)
Result:
1,2,3,4,5
I am always hesitant about giving answers that suggest dynamic SQL, often when dynamic SQL is required, the problem is best solved outside of SQL. Erland Sommarskog has written pretty extensively on the subject - The Curse and Blessings of Dynamic SQL
Anyway, your problem can be solved using dynamic SQL. You can build your SQL using the system views, and concatenate the column name using SQL Server's XML extensions:
-- Builds "SELECT CONCAT([c1],',',[c2],...) FROM <table>" from the catalog and runs it.
DECLARE @TableName SYSNAME = 'dbo.table1';

-- Each column contributes ",',',[name]"; STUFF removes the first five
-- characters (,',',) so the argument list starts with a column name.
DECLARE @SQL NVARCHAR(MAX) =
    'SELECT CONCAT(' + STUFF(( SELECT ','','',' + QUOTENAME(c.name)
                               FROM sys.columns AS c
                               WHERE c.object_id = OBJECT_ID(@TableName, 'U')
                               ORDER BY c.column_id
                               FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 5, '') + ')
FROM ' + @TableName + ';';

EXECUTE sp_executesql @SQL;
So for this table:
-- Example table used to show the SQL generated by the dynamic statement.
CREATE TABLE dbo.Table1
(
    Col1    VARCHAR(10),
    Col2    VARCHAR(10),
    Col3    INT,
    Col4    INT,
    Col5    VARCHAR(30)
);
The SQL that is generated is:
SELECT CONCAT([Col1],',',[Col2],',',[Col3],',',[Col4],',',[Col5])
FROM dbo.table1;
Just replace the name of your table for variable #tablename and you should be fine.
NULL values will be translated to a 'NULL' string
-- Returns every row of @tablename as a single ', '-separated string in which
-- NULL values appear as the text 'NULL'. Works for any number of columns.
SET NOCOUNT ON;

DECLARE @tablename nvarchar(50) = 'atable';
DECLARE @colList nvarchar(max);
DECLARE @sql nvarchar(max);

-- One " + ', ' + ISNULL(CAST([col] AS nvarchar(max)), 'NULL')" item per column;
-- STUFF removes the leading " + ', ' + " (10 characters) of the first item.
-- nvarchar(max) avoids the 30-character default of a length-less CAST.
SET @colList = STUFF((
        SELECT ' + '', '' + ISNULL(CAST(' + QUOTENAME(c.name) + ' AS nvarchar(max)), ''NULL'')'
        FROM sys.columns AS c
        WHERE c.[object_id] = OBJECT_ID(@tablename)
        ORDER BY c.column_id
        FOR XML PATH(''), TYPE
    ).value('.', 'nvarchar(max)'), 1, 10, '');

SET @sql = 'SELECT ' + @colList + ' FROM ' + @tablename;

EXEC (@sql);
Example:
This table (atable)
col1 col2
-------------------------------------------------- -----------
foo 1
bar 2
abc 3
def 4
ghi 5
yep NULL
will be transformed to
--------------------------------------------------------------
foo, 1
bar, 2
abc, 3
def, 4
ghi, 5
yep, NULL
returns 1 result per row of the original table.

Find columns that contain only zeros

I'm working with SQL Server 2008. I have a list of column names on a table and I'd like to know how to use SQL to return the names of those columns which contain nothing but zero or NULL values.
-- Returns the names of the columns of @T that contain only 0 or NULL.
DECLARE @T TABLE
(
    Col1 int,
    Col2 int,
    Col3 int,
    Col4 int
);

INSERT INTO @T (Col1, Col2, Col3, Col4) VALUES
    (1, 0   , NULL, NULL),
    (0, NULL, 0   , 1);

SELECT U.ColName
FROM
(
    -- NULLIF(col, 0) turns 0 into NULL and COUNT skips NULLs, so each count
    -- is the number of values that are neither 0 nor NULL.
    SELECT COUNT(NULLIF(Col1, 0)) AS Col1,
           COUNT(NULLIF(Col2, 0)) AS Col2,
           COUNT(NULLIF(Col3, 0)) AS Col3,
           COUNT(NULLIF(Col4, 0)) AS Col4
    FROM @T
) AS T
UNPIVOT
    (C FOR ColName IN (Col1, Col2, Col3, Col4)) AS U   -- one row per column
WHERE U.C = 0;
Result:
ColName
----------
Col2
Col3
The idea behind this is to count the non null values and only keep those with a count of 0.
COUNT will only count non null values.
NULLIF(ColX, 0) will make all 0 into null.
The inner query returns one row with four columns. UNPIVOT will turn it around so you have two columns and four rows.
Finally where U.C = 0 makes sure that you only get the columns that has no values other than null or 0.
Here is a brute force way, since you know all the column names.
-- Demo: report, as a comma-separated list, the columns that hold only 0 or NULL.
CREATE TABLE dbo.splunge
(
    a INT,
    b INT,
    c INT,
    d INT
);

INSERT dbo.splunge (a, b, c, d) VALUES (0,0,1,-1), (0,NULL,0,0), (0,0,0,NULL);

-- NULLs are mapped to 0; a column qualifies only when both its minimum and its
-- maximum are 0. Comparing MIN to MAX alone would also flag any column holding
-- a single constant non-zero value.
SELECT
    cols = STUFF(
          CASE WHEN MIN(COALESCE(a,0)) = 0 AND MAX(COALESCE(a,0)) = 0 THEN ',a' ELSE '' END
        + CASE WHEN MIN(COALESCE(b,0)) = 0 AND MAX(COALESCE(b,0)) = 0 THEN ',b' ELSE '' END
        + CASE WHEN MIN(COALESCE(c,0)) = 0 AND MAX(COALESCE(c,0)) = 0 THEN ',c' ELSE '' END
        + CASE WHEN MIN(COALESCE(d,0)) = 0 AND MAX(COALESCE(d,0)) = 0 THEN ',d' ELSE '' END,
        1, 1, '')   -- strip the leading comma
FROM dbo.splunge;
-- result:
-- a,b
GO
DROP TABLE dbo.splunge;
You could probably generate much of this script instead of doing it manually, assuming you know the naming scheme or data type of the columns you want (or just by leaving off the where clause entirely and removing the columns you don't want manually).
-- Generates one "+ CASE WHEN ... THEN ',<col>' ELSE '' END" line per column of
-- dbo.splunge, to paste inside a STUFF(...) expression. A column qualifies only
-- when both MIN and MAX of COALESCE(col, 0) are 0, so constant non-zero
-- columns are not reported.
SELECT CHAR(13) + CHAR(10) + ' + CASE WHEN MIN(COALESCE(' + QUOTENAME(name) + ',0)) = 0 AND '
     + 'MAX(COALESCE(' + QUOTENAME(name) + ',0)) = 0 THEN '',' + REPLACE(name, '''', '''''') + ''' ELSE '''' END'
FROM sys.columns
WHERE [object_id] = OBJECT_ID('dbo.splunge')
-- optional filters to narrow the column list:
-- AND system_type_id = 56
-- AND name LIKE '%some pattern%'
;
The output will look like the middle of the first query, so you can copy & paste and then remove the first + and add the surrounding STUFF and query...
Here's a way that works for any table:
-- Lists the columns of @tableName that contain only 0 or NULL, probing one
-- column at a time with dynamic SQL.
DECLARE @tableName sysname = N'myTable';
DECLARE @schemaName sysname;
DECLARE @columnName sysname;
DECLARE @retval int;
DECLARE @sql nvarchar(max);

CREATE TABLE #zeros (column_name sysname);

DECLARE c CURSOR LOCAL FORWARD_ONLY READ_ONLY FOR
    SELECT TABLE_SCHEMA, COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME = @tableName;

OPEN c;
FETCH NEXT FROM c INTO @schemaName, @columnName;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- number of rows whose value is neither 0 nor NULL
    SET @sql = N'set @retval = (select count(*) from '
             + QUOTENAME(@schemaName) + N'.' + QUOTENAME(@tableName)
             + N' where coalesce(' + QUOTENAME(@columnName) + N', 0) <> 0)';

    EXEC sp_executesql @sql, N'@retval int out', @retval = @retval OUT;

    IF @retval = 0
    BEGIN
        INSERT INTO #zeros (column_name) VALUES (@columnName);
    END

    FETCH NEXT FROM c INTO @schemaName, @columnName;
END

CLOSE c;
DEALLOCATE c;

SELECT column_name FROM #zeros;

DROP TABLE #zeros;

Select all columns except those with only null values

Is there a way to select the column names of a given table, excluding the columns that contain only NULL values, without knowing how many columns the table has?
-------------------------
| col1 | col2 | col3 |
------------------------
| val1 | null | val2 |
| val1 | null | null |
| null | null | val2 |
-------------------------
Should result in:
------------------------------------
| cols_except_those_with_null_only |
-----------------------------------
| col1 |
| col3 |
------------------------------------
Thanks!
Create a stored procedure with following content:
-- Returns the columns of tblTest that hold at least one non-NULL value.
CREATE TABLE #cols (colname sysname, nonNullCount int);

INSERT INTO #cols (colname)
SELECT name
FROM sys.columns
WHERE object_id = OBJECT_ID('tblTest');

DECLARE @c sysname;
DECLARE @stmt nvarchar(max);

-- STATIC: iterate over a snapshot, since the loop updates #cols itself.
DECLARE curCols CURSOR LOCAL STATIC FOR
    SELECT colname FROM #cols;

OPEN curCols;
FETCH NEXT FROM curCols INTO @c;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- The column name is embedded through QUOTENAME; the row to update is
    -- passed as a parameter rather than concatenated into the statement.
    SET @stmt = N'update #cols set nonNullCount = (select count(*) from tblTest where '
              + QUOTENAME(@c) + N' is not null) where colname = @colname';

    EXEC sp_executesql @stmt, N'@colname sysname', @colname = @c;

    FETCH NEXT FROM curCols INTO @c;
END

CLOSE curCols;
DEALLOCATE curCols;

SELECT colname AS cols_except_those_with_null_only
FROM #cols
WHERE nonNullCount > 0;

DROP TABLE #cols;
Try this, it's not the tidiest but will work, just set #Table to your table name.
-- Lists the columns of @Table that contain at least one non-NULL value.
DECLARE @Table sysname;
SET @Table = 'Example';

DECLARE @TempColumn sysname;
DECLARE @Sql nvarchar(max);
DECLARE @HasValues int;

CREATE TABLE #Columns (
    ColumnName sysname
);

DECLARE ColumnCursor CURSOR LOCAL FAST_FORWARD FOR
    SELECT COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME = @Table
    ORDER BY ORDINAL_POSITION;

OPEN ColumnCursor;
FETCH NEXT FROM ColumnCursor INTO @TempColumn;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- EXISTS stops at the first non-NULL value instead of counting every row.
    SET @Sql = N'SELECT @HasValuesOut = CASE WHEN EXISTS (SELECT 1 FROM ' + QUOTENAME(@Table)
             + N' WHERE ' + QUOTENAME(@TempColumn) + N' IS NOT NULL) THEN 1 ELSE 0 END';

    EXECUTE sp_executesql @Sql, N'@HasValuesOut int OUTPUT', @HasValuesOut = @HasValues OUTPUT;

    IF @HasValues = 1
    BEGIN
        INSERT INTO #Columns (ColumnName)
        VALUES (@TempColumn);
    END

    FETCH NEXT FROM ColumnCursor INTO @TempColumn;
END

CLOSE ColumnCursor;
DEALLOCATE ColumnCursor;

SELECT ColumnName FROM #Columns;

DROP TABLE #Columns;
With the following query you can list every column name of a table from within a stored procedure, and then check each column for NULL values, regardless of how many columns the table has.
-- Lists the column names of one table from the compatibility catalog views.
-- The filter must use a.[name]: the 'Table' alias is not visible in WHERE,
-- and a bare "table" is a reserved word.
SELECT a.[name] AS [Table],
       b.[name] AS [Column]
FROM sysobjects AS a
INNER JOIN syscolumns AS b
    ON a.[id] = b.[id]
WHERE a.[name] = 'yourtable'