Get top three most common values from every column in a table - sql

I'm trying to write a query that will produce a very small sample of data from each column of a table, in which the sample is made up of the top 3 most common values. This particular problem is part of a bigger task, which is to write scripts that can characterize a database and its tables, its data integrity, and also quickly survey common values in the table on a per-column basis. Think of this as an automated "analysis" of a table.
On a single column basis, I do this already by simply calculating the frequency of values and then sorting by frequency. If I had a column called "color" and all colors were in it, and it just so happened that the color "blue" was in most rows, then the top 1 most frequently occurring value would be "blue". In SQL that is easy to calculate.
However, I'm not sure how I would do this over multiple columns.
Currently, when I do a calculation over all columns of a table, I perform the following type of query:
-- Build and run a single SELECT that returns COUNT(DISTINCT ...) for every column of a table.
-- "database" and "table" are placeholders for the real database and table names.
USE database;

DECLARE @t nvarchar(max);

-- One COUNT(DISTINCT ...) expression per column, comma-separated.
-- Each column is cast to varchar(max) before counting, and QUOTENAME keeps
-- column names with spaces, brackets or reserved words from breaking the statement.
SELECT @t = COALESCE(@t + N', ', N'SELECT ')
    + N'COUNT(DISTINCT CAST(' + QUOTENAME(c.name) + N' AS varchar(max))) AS ' + QUOTENAME(c.name)
FROM sys.columns AS c
WHERE c.object_id = OBJECT_ID(N'table');

SET @t = @t + N' FROM table;';

EXEC sys.sp_executesql @t;
However, it's not entirely clear to me how I would do that here.
(Side note: columns of type text, ntext, and image should be excluded, since those would cause errors while counting distinct values, but I'm less concerned about solving that.)
But the problem of getting top three most frequent values per column has got me absolutely stumped.
Ideally, I'd like to end up with something like this:
Col1 Col2 Col3 Col4 Col5
---------------------------------------------------------------------
1,2,3 red,blue,green 29,17,0 c,d,j nevada,california,utah

I hacked this together, and it seems to work:
I can't help but think I should be using RANK().
-- Top three most frequent values for every column of the target table(s).
-- <DB> and <table> are placeholders: substitute the database name and a LIKE
-- pattern for the table name.
USE <DB>;

DECLARE @query     nvarchar(max);
DECLARE @column    sysname;
DECLARE @schema    sysname;
DECLARE @table     sysname;
DECLARE @qualified nvarchar(257);            -- schema.table, stored with each result row
DECLARE @i         int = 1;
DECLARE @maxi      int;
DECLARE @target    nvarchar(max) = N'<table>';

-- Work list: one row per (schema, table, column) to profile.
DECLARE @stage TABLE (i int IDENTITY(1,1), col sysname, sch sysname, tbl sysname);
-- Frequency of every distinct value of every profiled column.
DECLARE @results TABLE (ColumnName nvarchar(max), ColumnValue nvarchar(max), ColumnCount int, TableName nvarchar(max));

INSERT INTO @stage (col, sch, tbl)
SELECT c.name, s.name, o.name
FROM sys.columns AS c
INNER JOIN sys.objects AS o
    ON o.object_id = c.object_id
INNER JOIN sys.schemas AS s
    ON s.schema_id = o.schema_id
WHERE o.type = 'U'
  AND o.name LIKE @target
  -- text, ntext and image columns cannot be grouped, so they are skipped
  AND c.system_type_id IN (SELECT t.system_type_id
                           FROM sys.types AS t
                           WHERE t.[name] NOT IN ('text', 'ntext', 'image'))
ORDER BY s.name, o.name, c.column_id;

SET @maxi = (SELECT MAX(i) FROM @stage);     -- NULL when nothing matched, so the loop is skipped

WHILE @i <= @maxi
BEGIN
    SELECT @column = col, @schema = sch, @table = tbl
    FROM @stage
    WHERE i = @i;

    SET @qualified = @schema + N'.' + @table;

    -- Identifiers are quoted with QUOTENAME; the column and table names that end up
    -- as data are passed as parameters instead of being spliced in as literals.
    -- COUNT(*) is used so that NULL is counted like any other value.
    SET @query = N'SELECT @col, CAST(' + QUOTENAME(@column) + N' AS nvarchar(max)), COUNT(*), @tbl'
               + N' FROM ' + QUOTENAME(@schema) + N'.' + QUOTENAME(@table)
               + N' GROUP BY ' + QUOTENAME(@column) + N';';

    INSERT INTO @results (ColumnName, ColumnValue, ColumnCount, TableName)
    EXEC sys.sp_executesql
        @query,
        N'@col sysname, @tbl nvarchar(257)',
        @col = @column,
        @tbl = @qualified;

    SET @i = @i + 1;
END;

-- Full frequency table (every distinct value of every column).
SELECT ColumnName, ColumnValue, ColumnCount, TableName
FROM @results;

-- Top three values per column; partitioned by table as well so that same-named
-- columns from different matching tables are ranked separately.
WITH ranked AS (
    SELECT
        ColumnName,
        ColumnValue,
        ColumnCount,
        TableName,
        ROW_NUMBER() OVER (PARTITION BY TableName, ColumnName ORDER BY ColumnCount DESC) AS rn
    FROM @results
)
SELECT ColumnName, ColumnValue, ColumnCount, TableName, rn
FROM ranked
WHERE rn <= 3;

Start with this SQL Statement builder, and modify it to suit your liking:
EDIT Added Order by Desc
-- Generates one "top 3 most frequent values" statement per selected column.
-- The output is SQL text; copy it out and run it separately.
WITH ColumnSet AS (
    SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME IN ('Table1')
      AND COLUMN_NAME IN ('Column1', 'Column2')
)
-- QUOTENAME keeps names with spaces or reserved words valid in the generated statement.
SELECT 'SELECT TOP (3) ' + QUOTENAME(COLUMN_NAME) + ', COUNT(*) AS NumInstances'
     + ' FROM ' + QUOTENAME(TABLE_SCHEMA) + '.' + QUOTENAME(TABLE_NAME)
     + ' GROUP BY ' + QUOTENAME(COLUMN_NAME)
     + ' ORDER BY COUNT(*) DESC;' AS generated_statement
FROM ColumnSet;

Related

SQL count distinct or not null for each column for many columns

I need to analyze a large table with hundreds of columns. A lot of columns are unused.
To investigate I could do something like
-- One row per distinct value of Column1 (NULL, if present, appears once).
SELECT Column1
FROM myTable
GROUP BY Column1
or
-- Number of distinct values in Column1, counted over a derived table of the distinct values.
SELECT COUNT(*)
FROM (
    SELECT Column1
    FROM MyTable
    GROUP BY Column1
) AS C
Then I do the same for Column2 and so on. However, these queries only work for one column at a time, which is time-consuming and does not give an overview at a glance.
Any idea how to build such investigation query for all columns in one?
You need only 1 query where you have to list all the columns of the table:
-- One pass over the table: the number of distinct non-NULL values in each listed column.
SELECT
    COUNT(DISTINCT Column1) AS column1_count,
    COUNT(DISTINCT Column2) AS column2_count,
    COUNT(DISTINCT Column3) AS column3_count
    -- , COUNT(DISTINCT ColumnN) AS columnN_count   -- repeat for each remaining column
FROM MyTable;
For local purposes only, you can make it dynamic like this:
1. Get the columns of the table.
2. Build the query as in the other answers, then execute it with EXEC().
-- Builds and runs "SELECT COUNT(DISTINCT col) AS col_count, ... FROM Customer"
-- for every column of the Customer table.
DECLARE @columns TABLE (RowId int IDENTITY(1,1), ColumnName sysname);  -- sysname: long names are not truncated
DECLARE @ii int = 1;                  -- IDENTITY(1,1) numbering starts at 1
DECLARE @max int = 0;
DECLARE @selectList nvarchar(max) = N'';
DECLARE @sqlQuery nvarchar(max);

INSERT INTO @columns (ColumnName)
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = N'Customer'
ORDER BY ORDINAL_POSITION;

SELECT @max = COUNT(*) FROM @columns;

WHILE @ii <= @max
BEGIN
    -- The separator is added before every item except the first,
    -- so no trailing comma has to be patched out afterwards.
    SELECT @selectList = CONCAT(
               @selectList,
               CASE WHEN @ii > 1 THEN N', ' ELSE N'' END,
               N'COUNT(DISTINCT ', QUOTENAME(ColumnName), N') AS ',
               QUOTENAME(LOWER(ColumnName) + N'_count'))
    FROM @columns
    WHERE RowId = @ii;

    SET @ii = @ii + 1;
END;

SET @sqlQuery = CONCAT(N'SELECT ', @selectList, N' FROM Customer');

-- Show the generated statement, then run it (only if the table has columns).
SELECT @sqlQuery AS generated_query;

IF @max > 0
    EXEC (@sqlQuery);
You should flesh out your requirement a bit more. If all you want to know is if a column contains only NULLs, you'll want to check for max(ColumnName) is null
-- Lists every column, in every user table, that contains only NULLs.
-- One probe query is generated per column and run in a loop; each probe inserts
-- the column's three-part name into ##emptyColumns when MAX(column) is NULL.
-- NOTE(review): MAX() is not defined for every data type (e.g. bit); columns of
-- such types would need to be filtered out or cast -- confirm against your schema.
DECLARE @sql TABLE (id int IDENTITY(1,1), QueryString nvarchar(max));

-- Wide enough for [schema].[table].[column] with every part quoted.
CREATE TABLE ##emptyColumns (emptyColumn nvarchar(776));

DECLARE @i int = 0;
DECLARE @iMax int;
DECLARE @runthis nvarchar(max);

INSERT INTO @sql (QueryString)
SELECT N'select '''
     -- the quoted name is embedded as a string literal, so any single quote in it is doubled
     + REPLACE(QUOTENAME(s.name) + N'.' + QUOTENAME(t.name) + N'.' + QUOTENAME(c.name), N'''', N'''''')
     + N''' as [column] from ' + QUOTENAME(s.name) + N'.' + QUOTENAME(t.name)
     + N' having max(' + QUOTENAME(c.name) + N') is null'
FROM sys.tables AS t
INNER JOIN sys.columns AS c
    ON c.object_id = t.object_id
INNER JOIN sys.schemas AS s
    ON s.schema_id = t.schema_id
ORDER BY s.name, t.name, c.column_id;

SELECT @iMax = COUNT(*)
FROM @sql;

PRINT @iMax;

WHILE @i < @iMax
BEGIN
    SET @i = @i + 1;

    SELECT @runthis = N'insert into ##emptyColumns (emptyColumn) ' + QueryString
    FROM @sql
    WHERE id = @i;

    EXECUTE sys.sp_executesql @runthis;
END;

SELECT emptyColumn
FROM ##emptyColumns;

DROP TABLE ##emptyColumns;
One further option you might consider:
-- One result row per column of MySchema.MyTable: the column name, how many rows are NULL,
-- and how many distinct non-NULL values it holds. The per-column queries are
-- glued together with UNION ALL and executed as one statement.
DECLARE @sql nvarchar(max);

SELECT @sql = COALESCE(@sql + N' union all ', N'')
    + N'select ''' + REPLACE(COLUMN_NAME, N'''', N'''''') + N''' as column_name,
sum(case when ' + QUOTENAME(COLUMN_NAME) + N' is null then 1 else 0 end) as null_values,
count(distinct ' + QUOTENAME(COLUMN_NAME) + N') as count_distinct
from ' + QUOTENAME(TABLE_SCHEMA) + N'.' + QUOTENAME(TABLE_NAME) + N'
'
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = 'MySchema'
  AND TABLE_NAME = 'MyTable';

EXEC (@sql);
If you had very big tables with large numbers of columns and were only interested in empty columns you could look into something like checksum_agg(checksum(column_name)). It may help improve performance.
You'd need to be wary of column data types, as they are not all compatible with distinct.

Iterate through temporary table columns to select them

I have a final temporary table (#tempTable) with unknown columns number.
My final select is like this, it works :
-- Return every column of the temporary table.
SELECT
    temp.*
FROM #tempTable AS temp
But instead of a '*' I would like to call each columns individually :
-- Same query with the columns listed individually.
SELECT
    temp.col1,
    temp.col2
FROM #tempTable AS temp
To do so I need to iterate through my columns names and create a procedure, I tried something like this :
DECLARE #ColName VARCHAR(255)
SELECT #ColName = min(name) FROM tempdb.sys.columns
WHERE object_id = Object_id('tempdb..#TEMPTABLE');
WHILE #ColName is not null
BEGIN
-- i need to do it all in once and not each time....
declare #sql varchar(max) = 'SELECT tp.'+'#COlName'+'FROM #TEMPTABLE tp'
exec(#sql)
-- Increment the value, how to go to next column ?
select #ColName = min(name) FROM tempdb.sys.columns WHERE object_id =
Object_id('tempdb..#TEMPTABLE') > #ColName -- does not work because it is a string (column name)
END
Try this:
-- Build "select temp.[col1], temp.[col2], ... from #TEMPTABLE temp" from the
-- temp table's metadata in tempdb, then run it.
DECLARE @ColName nvarchar(max) = N'select ';

SELECT @ColName = @ColName + N' temp.' + QUOTENAME(name) + N','
FROM tempdb.sys.columns
WHERE object_id = OBJECT_ID('tempdb..#TEMPTABLE');

-- Delete the last character (the trailing comma) and append the table name.
SET @ColName = SUBSTRING(@ColName, 1, LEN(@ColName) - 1) + N' from #TEMPTABLE temp';

EXEC (@ColName);
This query builds the whole column list and combines it into a single select ... from ... statement. I increased the size of the varchar variable so that it can accommodate long queries.
Also, IMO a variable name such as @sql or @query would be more meaningful.
A set based approach
-- Demo table; drop any copy left over from a previous run.
IF OBJECT_ID('tempdb..#TEMPTABLE', 'U') IS NOT NULL
    DROP TABLE #TEMPTABLE;

CREATE TABLE #TEMPTABLE (
    Id   INT IDENTITY(1,1)
   ,Col1 INT
   ,Col2 BIGINT
   ,Col3 BIGINT
   ,Col4 DATETIME
   ,Col5 DATETIME
);

DECLARE @SQL NVARCHAR(MAX);

-- Concatenate ", temp.[col]" for every column in column order, then strip the
-- leading ", ". TYPE + .value() returns the text without XML entity encoding.
SELECT @SQL = N'SELECT ' + STUFF((
        SELECT N', temp.' + QUOTENAME(S.name)
        FROM tempdb.sys.columns AS S
        WHERE S.object_id = OBJECT_ID('tempdb..#TEMPTABLE')
        ORDER BY S.column_id
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'), 1, 2, N'') + N' FROM #TEMPTABLE temp';

EXEC sys.sp_executesql @SQL;

How do I use loop to generate column names dynamically?

I have table sdata and it has 35 columns (id, name, TRx1, TRx2, TRx3, TRx4,..., TRx30, city, score, total)
I want to fetch data from the TRx1,...TRx30 columns.
Can I use loop here?
I did following code:
DECLARE #flag INT
DECLARE #sel varchar(255)
DECLARE #frm varchar(255)
SET #flag = 1;
SET #sel = 'select TRx';
SET #frm = ' from sdata';
exec(#sel +
(WHILE #flag <=5
#flag
SET #flag = #flag + 1)
+ #frm)
What am I doing wrong? And how can I resolve this?
If your table name is sdata, this code should work for you:
-- Grab the names of all the TRx columns of sdata and select just those columns.
DECLARE @sql nvarchar(MAX);
DECLARE @columns nvarchar(MAX);

-- ", [name]" per matching column in column order; STUFF removes the leading ", ".
-- QUOTENAME builds the brackets (and escapes any "]" inside a name).
SELECT @columns = STUFF((
        SELECT N', ' + QUOTENAME(name)
        FROM sys.columns
        WHERE object_id = OBJECT_ID(N'sdata')
          AND name LIKE N'TRx%' -- To limit which columns
        ORDER BY column_id
        FOR XML PATH(''), TYPE
    ).value('.', 'nvarchar(MAX)'), 1, 2, N'');

PRINT @columns;

SELECT @sql = N'SELECT ' + @columns + N' FROM sdata';

PRINT @sql;

EXEC (@sql);
Note I included PRINT statements so you could see what's going on. You might want to comment out the EXEC while testing.
This would be much easier to do by just copy/pasting the column names and changing them to be the correct one. However if you must do it this way, I do not advise using a loop at all. This method uses a tally table to generate the columns you want to select (in this example, columns 1 through 30, but that can be changed), then generates a dynamic SQL statement to execute against the SData table:
-- Select columns TRx<@From> .. TRx<@To> from SData without a loop:
-- a tally (numbers) CTE generates the column names, which are then
-- concatenated into a dynamic SELECT.
DECLARE @From int = 1,
        @To   int = 30,
        @Sql  nvarchar(max);

-- N is kept so the column list can be emitted in numeric order.
DECLARE @Columns TABLE (N bigint PRIMARY KEY, Col varchar(255));

WITH Nums AS (
    SELECT V.N
    FROM (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) AS V(N)
),
Tally AS (
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS N
    FROM Nums AS A          -- 10
    CROSS JOIN Nums AS B    -- 100
    CROSS JOIN Nums AS C    -- 1000
)
INSERT INTO @Columns (N, Col)
SELECT N, 'TRx' + CAST(N AS varchar(10))
FROM Tally
WHERE N BETWEEN @From AND @To;

-- ",[TRx1],[TRx2],..." in numeric order; STUFF removes the leading comma.
SELECT @Sql = N'Select ' + STUFF((
        SELECT N',' + QUOTENAME(Col)
        FROM @Columns
        ORDER BY N
        FOR XML PATH(''), TYPE
    ).value('.', 'nvarchar(max)'), 1, 1, N'') + N' From SData';

--Select @Sql
EXECUTE (@Sql);
Note: The --Select #Sql section is there to preview the generated query before executing it.
You can select the column names like this:
-- List the column names of one table from the information schema.
SELECT c.column_name
FROM information_schema.columns AS c
WHERE c.table_name = 'my name here'

Loop through column values in Temp Table with N columns

I have a scenario inside a stored procedure where a temporary table will be generated with an unknown number of columns (Column1.....ColumnN). One of the columns will be the total\sum of few of the other columns.
The client's requirement is to show the percentage value of each column in comparison to the total column
(C1*100)/Total as P1 ,(C2*100)/Total as P2.....
I have really been unable to find a solution to this problem other than doing it in the front end using LINQ. I am wondering if there is any way to achieve this in SQL, as that would give me performance benefits. The last thing I want to do is to loop through the rows and columns in C#, which will hammer the server.
I have done something similar; I just adapted it to your case, and you can read the comments to understand it better. I assume the schema name is dbo; otherwise change it.
-------------1. first step --------------
-- Create a small table for the exercise.
CREATE TABLE [dbo].[tblTest](
    [ID] [int] NULL,
    [isTrue] [bit] NULL
) ON [PRIMARY]

-- Insert sample data (explicit column list so the statement survives schema changes).
INSERT INTO tblTest ([ID], [isTrue])
VALUES (1, 'true'), (2, 'false'), (3, 'false'), (4, 'true'), (5, 'false')

SELECT [ID], [isTrue]
FROM tblTest
-------------2. second step --------------
-- Walk the table's columns with a cursor and copy every value of every column
-- into #Results as (column name, value, column position).
DECLARE @TableName  nvarchar(256) = N'[dbo].[tblTest]',
        @SearchStr  nvarchar(128) = N'id',
        @SearchStr2 nvarchar(110);  -- only used by the optional LIKE filter described below
SET @SearchStr2 = QUOTENAME('%' + @SearchStr + '%', '''');

DECLARE @Columnname sysname,
        @ColumnIndex int,
        @Statement nvarchar(max);

CREATE TABLE #Results (ColumnName nvarchar(370), ColumnValue nvarchar(3630), ColIndex int);

DECLARE getItemID CURSOR
FOR
    SELECT COLUMN_NAME, ORDINAL_POSITION
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME = PARSENAME(@TableName, 1)
      -- honour the schema part of @TableName when one is given
      AND TABLE_SCHEMA = COALESCE(PARSENAME(@TableName, 2), TABLE_SCHEMA);

OPEN getItemID;
FETCH NEXT FROM getItemID INTO @Columnname, @ColumnIndex;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- The column name and index are passed as parameters: concatenating the int
    -- @ColumnIndex into the string would raise a conversion error.
    -- To keep only matching values, append:
    --   N' WHERE ' + QUOTENAME(@Columnname) + N' LIKE ' + @SearchStr2
    SET @Statement = N'SELECT @name, LEFT(' + QUOTENAME(@Columnname) + N', 3630), @idx'
                   + N' FROM ' + @TableName + N' WITH (NOLOCK)';

    INSERT INTO #Results (ColumnName, ColumnValue, ColIndex)
    EXEC sys.sp_executesql
        @Statement,
        N'@name sysname, @idx int',
        @name = @Columnname,
        @idx = @ColumnIndex;

    FETCH NEXT FROM getItemID INTO @Columnname, @ColumnIndex;
END;

CLOSE getItemID;
DEALLOCATE getItemID;

SELECT ColumnName, ColumnValue, ColIndex
FROM #Results;

DROP TABLE #Results;
-- Per journey: seconds spent in each notch label, pivoted to one column per label
-- and expressed as a percentage of the journey's total running time.
DECLARE @cols    AS NVARCHAR(max),   -- "[label1],[label2],..." for the PIVOT
        @calCols AS NVARCHAR(max),   -- "ROUND([label]*100/RunningTime,2) as [label],..." for the final select
        @query   AS NVARCHAR(max);

-- Seconds per (journey, notch label); the label is derived from notch by the CASE below.
SELECT journeyid, notchl, Duration
INTO #temptable
FROM (
    SELECT journeyid,
           notchl,
           CAST(SUM(DATEDIFF(second, starttime, endtime)) AS FLOAT) AS Duration
    FROM (
        SELECT notchlog.*,
               CASE
                   WHEN (ISNUMERIC(notch) = 1 AND notch < 0) THEN 'DYN'
                   WHEN notch = 'I' THEN 'IDLE'
                   WHEN notch = 'C' THEN 'COASTING'
                   ELSE 'N' + notch
               END AS NotchL
        FROM notchlog
    ) AS labelled
    GROUP BY journeyid, notchl
) AS totals;

-- Quoted list of the distinct labels.
SELECT @cols = STUFF((
        SELECT ',' + QUOTENAME(notchl)
        FROM #temptable
        GROUP BY notchl
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'), 1, 1, '');

-- Percentage expression per label; NULLIF avoids a divide-by-zero error when a
-- journey's total running time is 0 (the percentage is then NULL).
SELECT @calCols = STUFF((
        SELECT ',' + 'ROUND(' + QUOTENAME(notchl)
             + '*100/NULLIF(RunningTime,0),2) as '
             + QUOTENAME(notchl)
        FROM #temptable
        GROUP BY notchl
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'), 1, 1, '');

-- Dynamic batch: pivot into #ResultTable, join the per-journey running time, report, clean up.
SET @query = N'Select * INTO #ResultTable FROM( SELECT Journeyid, '
    + @cols
    + N' from ( select Journeyid, NotchL, Duration from #TempTable Group By JourneyId,NotchL,Duration ) x pivot ( max(x.duration) for NotchL in ('
    + @cols
    + N') ) p ) Sub2 select NL.JourneyId,RunningTime,'
    + @calCols
    + N' from #ResultTable R INNER Join (Select JourneyID,Sum(DateDiff(second,starttime,endtime)) as RunningTime FROM NotchLog Group By JourneyID)NL ON NL.JourneyID=R.JourneyId INNER Join Journeys J ON J.JourneysID=R.JourneyID Drop Table #ResultTable ';

EXEC sys.sp_executesql @query;

DROP TABLE #temptable;

SQL schema and value

I have a select statement I want to make. I want to select
-- Column names of table1, returned under the heading FieldName.
SELECT c.COLUMN_NAME AS FieldName
FROM INFORMATION_SCHEMA.COLUMNS AS c
WHERE c.TABLE_NAME = 'table1'
However I want to create another column named Value which is a particular row in table1
so I have rows of the column name and the corresponding single value. Any thoughts on how to approach this?
The following query produces a value (the minimum) for each column:
-- Generates one statement per column of the table named in "const"; each generated
-- statement returns the column name and the column's minimum value as text.
SELECT 'select ''' + REPLACE(c.COLUMN_NAME, '''', '''''') + ''' AS FieldName, '
     + '(select cast(MIN(' + QUOTENAME(c.COLUMN_NAME) + ') as varchar(8000)) from ' + QUOTENAME(const.tablename) + ')'
FROM INFORMATION_SCHEMA.COLUMNS AS c
CROSS JOIN (SELECT 'AllCurveNames' AS tablename) AS const
WHERE c.TABLE_NAME = const.tablename
However, this produces a separate query for each row. To combine them together, you need a string aggregate concatenation. This is how you would do it in SQL Server:
-- One result row per column: the column name and its minimum value as text.
-- The per-column selects are concatenated with UNION ALL and executed as one statement.
DECLARE @sql nvarchar(max);

-- Every piece is prefixed with " union all " (11 characters); STUFF removes the first
-- prefix. TYPE + .value() returns the text without XML entity encoding.
SELECT @sql = STUFF((
        SELECT N' union all select ''' + REPLACE(c.COLUMN_NAME, N'''', N'''''') + N''' AS FieldName, '
             + N'(select cast(MIN(' + QUOTENAME(c.COLUMN_NAME) + N') as varchar(8000)) from '
             + QUOTENAME(const.tablename) + N') AS [Value]'
        FROM INFORMATION_SCHEMA.COLUMNS AS c
        -- table to inspect; replace the name as needed
        CROSS JOIN (SELECT N'AllCurveNames' AS tablename) AS const
        WHERE c.TABLE_NAME = const.tablename
        ORDER BY c.ORDINAL_POSITION
        FOR XML PATH(''), TYPE
    ).value('.', 'nvarchar(max)'), 1, 11, N'');

EXEC (@sql);
Use a cross join, which is implicit if you just select from two tables with no join (i.e., from t1, t2):
-- Pair every column name of table1 with MyField from the single Table1 row whose ID is 123.
SELECT
    c.COLUMN_NAME AS FieldName,
    t.MyField
FROM INFORMATION_SCHEMA.COLUMNS AS c
CROSS JOIN Table1 AS t
WHERE c.TABLE_NAME = 'table1'
  AND t.ID = 123
I actually came up with a bit of a crazy solution but it works:
-- For every column of @tbl_name, fetch one value (TOP 1, no ordering, so which row
-- is used is arbitrary) and collect (column name, value) pairs in #tbl.
DECLARE @tbl_name sysname = N'table1';
DECLARE @field    sysname;
DECLARE @val      varchar(255);
DECLARE @SQL      nvarchar(max);

CREATE TABLE #tbl ([FieldName] [varchar](255), [FieldVal] [varchar](255));

DECLARE mah_cursor CURSOR FAST_FORWARD
FOR
    SELECT COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME = @tbl_name;

OPEN mah_cursor;
FETCH NEXT FROM mah_cursor INTO @field;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- The value comes back through the OUTPUT parameter @val.
    SET @SQL = N'set @val = (Select top 1 ' + QUOTENAME(@field) + N' from ' + QUOTENAME(@tbl_name) + N')';
    PRINT @SQL;

    EXEC sys.sp_executesql
        @stmt = @SQL,
        @params = N'@val varchar(255) OUTPUT',
        @val = @val OUTPUT;

    INSERT INTO #tbl ([FieldName], [FieldVal])
    VALUES (@field, @val);

    FETCH NEXT FROM mah_cursor INTO @field;
END;

CLOSE mah_cursor;
DEALLOCATE mah_cursor;

SELECT [FieldName], [FieldVal]
FROM #tbl;

DROP TABLE #tbl;
It loops through each column and adds its value. The FAST_FORWARD option optimizes the cursor for high performance.