SQL: count number of distinct values in every column - sql

I need a query that will return a table where each column is the count of distinct values in the columns of another table.
I know how to count the distinct values in one column:
select count(distinct columnA) from table1;
I suppose that I could just make this a really long select clause:
select count(distinct columnA), count(distinct columnB), ... from table1;
but that isn't very elegant and it's hardcoded. I'd prefer something more flexible.

This code should give you all the columns in 'table1' with the respective distinct value count for each one as data.
DECLARE #TableName VarChar (Max) = 'table1'
DECLARE #SqlString VarChar (Max)
set #SqlString = (
SELECT DISTINCT
'SELECT ' +
RIGHT (ColumnList, LEN (ColumnList)-1) +
' FROM ' + Table_Name
FROM INFORMATION_SCHEMA.COLUMNS COL1
CROSS AppLy (
SELECT ', COUNT (DISTINCT [' + COLUMN_NAME + ']) AS ' + '''' + COLUMN_NAME + ''''
FROM INFORMATION_SCHEMA.COLUMNS COL2
WHERE COL1.TABLE_NAME = COL2.TABLE_NAME
FOR XML PATH ('')
) TableColumns (ColumnList)
WHERE
1=1 AND
COL1.TABLE_NAME = #TableName
)
EXECUTE (#SqlString)

try this (sql server 2005 syntax):
DECLARE #YourTable table (col1 varchar(5)
,col2 int
,col3 datetime
,col4 char(3)
)
insert into #YourTable values ('abcdf',123,'1/1/2009','aaa')
insert into #YourTable values ('aaaaa',456,'1/2/2009','bbb')
insert into #YourTable values ('bbbbb',789,'1/3/2009','aaa')
insert into #YourTable values ('ccccc',789,'1/4/2009','bbb')
insert into #YourTable values ('aaaaa',789,'1/5/2009','aaa')
insert into #YourTable values ('abcdf',789,'1/6/2009','aaa')
;with RankedYourTable AS
(
SELECT
ROW_NUMBER() OVER(PARTITION by col1 order by col1) AS col1Rank
,ROW_NUMBER() OVER(PARTITION by col2 order by col2) AS col2Rank
,ROW_NUMBER() OVER(PARTITION by col3 order by col3) AS col3Rank
,ROW_NUMBER() OVER(PARTITION by col4 order by col4) AS col4Rank
FROM #YourTable
)
SELECT
SUM(CASE WHEN col1Rank=1 THEN 1 ELSE 0 END) AS col1DistinctCount
,SUM(CASE WHEN col2Rank=1 THEN 1 ELSE 0 END) AS col2DistinctCount
,SUM(CASE WHEN col3Rank=1 THEN 1 ELSE 0 END) AS col3DistinctCount
,SUM(CASE WHEN col4Rank=1 THEN 1 ELSE 0 END) AS col4DistinctCount
FROM RankedYourTable
OUTPUT:
col1DistinctCount col2DistinctCount col3DistinctCount col4DistinctCount
----------------- ----------------- ----------------- -----------------
4 3 6 2
(1 row(s) affected)

and it's hardcoded.
It is not hardcoding to provide a field list for a sql statement. It's common and acceptable practice.

This won't necessarily be possible for every field in a table. For example, you can't do a DISTINCT against a SQL Server ntext or image field unless you cast them to other data types and lose some precision.

I appreciate all of the responses. I think the solution that will work best for me in this situation (counting the number of distinct values in each column of a table from an external program that has no knowledge of the table except its name) is as follows:
Run "describe table1" and pull out the column names from the result.
Loop through the column names and create the query to count the distinct values in each column. The query will look something like "select count(distinct columnA), count(distinct columnB), ... from table1".

Raj More's answer works well if you don't need to consider null as a value as count(distinct...) does not count null.
Here is a modification to count values including null by converting values to a string and replacing null with "NULL AS SOME IMPOSSIBLE STRING":
DECLARE #TableName VarChar (1024) = 'tableName'
DECLARE #SqlString VarChar (Max)
set #SqlString = (
SELECT DISTINCT
'SELECT ' +
RIGHT (ColumnList, LEN (ColumnList)-1) +
' FROM ' + Table_Name
FROM INFORMATION_SCHEMA.COLUMNS COL1
CROSS AppLy (
SELECT ', COUNT (DISTINCT coalesce(cast([' + COLUMN_NAME + '] as varchar),
''NULL AS SOME IMPOSSIBLE STRING'')) AS ' + '''' + COLUMN_NAME + ''''
FROM INFORMATION_SCHEMA.COLUMNS COL2
WHERE COL1.TABLE_NAME = COL2.TABLE_NAME
FOR XML PATH ('')
) TableColumns (ColumnList)
WHERE
COL1.TABLE_NAME = #TableName
)
EXECUTE (#SqlString)

DISTINCT is evil. Do COUNT/GROUP BY

Related

Divided by some number into column SQL

Well, I have a number which stored in a column. If I want to divide the value, I just need:
select columnName / 12 from myTable
Is it possible to put the result into 12 column? I want to make the 12 is flexible. So for instance, if I divide the value by 4, so the result should be 4 column.
Value Result1 Result2 Result3 Result4
12000 3000 3000 3000 3000
Does anyone know how to achieve this?
Thank you.
This achievable using dynamic query.
declare #cols nvarchar(max);
declare #sql nvarchar(1000);
with cte as (
select 12000 as col1, 12000/4 as col2
union all
select col1-col2, col2 from cte where col1 > col2
)
select #cols =
STUFF((select N',' + QUOTENAME(col2) from cte
FOR XML PATH('')
), 1, 1, '') + N'';
select #cols
You can avoid dynamic SQL if you know a maximum count:
Start with a mockup-table to simulate your issue
DECLARE #tbl TABLE(InitialValue DECIMAL(16,4));
INSERT INTO #tbl VALUES(12000),(20000),(10000);
--To test this I use a divisor of 4, try with other numbers
DECLARE #divisor DECIMAL(16,4)=4;
--This is the query
SELECT p.*
FROM
(
SELECT t.InitialValue / #divisor As DivResult
,t.InitialValue
,CONCAT('div',FORMAT(A.Nmbr,'00')) AS ColumnName
FROM #tbl t
CROSS APPLY(SELECT TOP(CAST(#divisor AS INT)) ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) FROM master..spt_values) A(Nmbr)
) t
PIVOT
(MAX(DivResult) FOR ColumnName IN(div01,div02,div03,div04,div05,div06,div07 /*add as many as you might need*/)) p;
The result
InitialValue div01 div02 div03 div04 div05 div06 div07
10000.0000 2500.000000000000000000000 2500.000000000000000000000 2500.000000000000000000000 2500.000000000000000000000 NULL NULL NULL
12000.0000 3000.000000000000000000000 3000.000000000000000000000 3000.000000000000000000000 3000.000000000000000000000 NULL NULL NULL
20000.0000 5000.000000000000000000000 5000.000000000000000000000 5000.000000000000000000000 5000.000000000000000000000 NULL NULL NULL
As you can see, the unused columns are returned but stay NULL.
I'd prefer this approach over dynamic sql as the consumer is better of in most cases if the result set and its structure is fixed and predictable...
Hint: You can add the divisor to your result set if needed...
It has to be dynamic query
Check below
Create TABLE Table1 (Origional int)
Declare #DivisonValue INT = 4
insert into Table1
VALUES (12000)
--Change the Top (12) to you what ever number you like. this will be your total number of columns
DECLARE #Columns VARCHAR(MAX) = (SELECT
',' + C.ColumnName + ' = Origional / ' + CAST(#DivisonValue AS VARCHAR(5))
/* Above line which you need to change for division */
FROM
(SELECT TOP (#DivisonValue)
ColumnId = ROW_NUMBER() OVER (ORDER BY s1.[object_id])
,ColumnName ='Result' + CONVERT(VARCHAR(10), ROW_NUMBER() OVER (ORDER BY s1.[object_id]))
FROM sys.all_objects AS s1 CROSS JOIN sys.all_objects AS s2
) AS C
ORDER BY
C.ColumnId
FOR XML PATH (''))
DECLARE #FullQuery VARCHAR(MAX) = 'SELECT Origional,'+ substring(#Columns,2,LEN(#Columns)-1) + ' FROM Table1'
EXEC (#FullQuery)
DROP TABLE table1
GO
Change the logic as per your need for division.

MSSQL select lowest but not NULL/zero value from 25 columns

I have 25 (numeric) columns in one table in MSSQL and I need select lowest value, but not NULL or 0 value.
Columns are named like "%_price" (aaa_price, bbb_price, ccc_price...).
Some columns contains 0 or NULL value.
Example:
table (aaa_price, bbb_price, ccc_price, ddd_price, eee_price, fff_price)
value (NULL, 0, 324.23, 162.50, NULL, 1729.72 )
Right result:
162.50
I can use some "brute force" method like:
SELECT CASE
WHEN Col1 <= Col2 AND Col1 <= Col3 AND Col1 <= Col4 AND Col1 <= Col5 THEN Col1
WHEN Col2 <= Col3 AND Col2 <= Col4 AND Col2 <= Col5 THEN Col2
WHEN Col3 <= Col4 AND Col3 <= Col5 THEN Col3
WHEN Col4 <= Col5 THEN Col4
ELSE Col5
END AS [Min Value] FROM [Your Table]
But its insane with 25 columns... is there any better solution?
Thank You!
Cross apply can be good option in this case:
select
*
from
myTable
cross apply (select
minVal = min(val)
from (
values (aaa_price),(bbb_price),(...)
) t(val) where val > 0
) q
Edit:
You have to use dynamic SQL if you want to get column names from INFORMATION_SCHEMA.COLUMNS table.
declare #sql varchar(8000)
declare #cols varchar(8000)
select #cols =
stuff((
SELECT
',(' + COLUMN_NAME + ')'
FROM
INFORMATION_SCHEMA.COLUMNS
WHERE
TABLE_NAME = 'mytable'
AND TABLE_SCHEMA='dbo'
AND COLUMN_NAME LIKE '%price'
for xml path('')
), 1, 1, '')
set #sql = 'select
*
from
mytable
cross apply (select
minVal = min(val)
from (
values ' + #cols + '
) t(val) where val > 0
) q'
exec (#sql)
You can create a dynamic SQL statement and execute it in the following form
declare #tablename sysname = 'MultipleNumericColumns'
declare #sql nvarchar(max)
select #sql = isnull(#sql + ' union all ','') + '
SELECT ' + name + ' as colname from ' + #tablename
from sys.all_columns
where
object_id = OBJECT_ID(#tablename)
set #sql = '
select min(colname)
from (
' + #sql + '
) t
where colname > 0'
EXECUTE(#sql)
You can realize that first I get the column names from system view
You can exclude columns that you don't want or use a pattern like name like '%price% etc at this step
Then I build a dynamic SQL query into a string variable as sql command
Please note that I use WHERE clause for greater than 0, etc
Final step is execution with EXECUTE command
Use UNPIVOT
SELECT Min(VALUE)
FROM (
SELECT Col1, Col2, Col3...
FROM YourTable
) t
UNPIVOT (VALUE FOR ITEM IN (Col1, Col2, Col3...)) u
WHERE VALUE != 0

Can you MAX(*) in SQL Server?

I am dealing with a huge list of columns (around 50) where i only need to group by one column. Is there anyway in SQL Server i can aggregate the columns by something such as
SELECT MAX(*)
FROM View1
GROUP BY Column1
instead of having to go through each one and specify an aggregate function. I have had a look online but cant find anything. Is there any advice or guidance someone can give me or is it just a case of going through each row?
Thanks
You can build query you need using system tables:
DECLARE #ViewName sysname = N'View1',
#query nvarchar(max),
#Column sysname = 'Column1'
SET #query = N'SELECT ' + #Column + ',' + CHAR(10)
SELECT #query = #query + N'MAX('+c.[name]+') as '+c.[name]+',' + CHAR(10)
FROM sys.views v
INNER JOIN sys.columns c
ON v.[object_id] = c.[object_id]
WHERE v.[name] = #ViewName AND c.[name] != #Column
SET #query = STUFF(#query,LEN(#query)-1,1,'') + 'FROM '+#ViewName + CHAR(10) + 'GROUP BY ' + #Column
PRINT #query
Output will be:
SELECT Column1,
MAX(Column2) as Column2,
MAX(Column3) as Column3,
...
MAX(ColumnN) as ColumnN
FROM View1
GROUP BY Column1
You can Ctrl+C Ctrl+V on new query window and execute, or execute it right here with:
EXEC (#query)
In case of tables - you need to use sys.tables
In case if view or table is not in default schema - you need to specify it manually.
SET #query = STUFF(#query,LEN(#query)-1,1,'') + 'FROM dbo.'+#ViewName + CHAR(10) + 'GROUP BY ' + #Column
No. Unfortunately, you're going to have to write out the columns.
If you need max value from different columns you can try as below
Select max(yourcolumn) from
(
Select col1 from yourtable
union all Select col2 from yourtable
union all Select col3 from yourtable
...
) a
No, you can not use max(*).
You will have to give a column name in max function like below
select max(column_name) from table_name;
You cannot write select max(*). This results in the error Incorrect syntax near '*'. Instead, you will need to specify the columns.
One way to get the maximum of multiple columns is to unpivot the table. An efficient way to do this is to use cross apply to generate separate row values for each column.
For example, the code below finds the maximum value across 3 different columns in all of the rows:
declare #test table
(
id int primary key clustered,
value1 int,
value2 int,
value3 int
)
insert into #test (id, value1, value2, value3)
values (1, 100, 0, 0), (2, 0, 50, 0), (3, 0, 0, 25)
select max(TestValue) -- returns 100
from #test
cross apply
(
values(value1),(value2), (value3)
) TestValues (TestValue)

One view from two tables with identical column names

We have two tables that we need to merge into a singular view. Normally I'd individually select columns to avoid this issue, however in this case the two tables are a combined 800 columns.
The only identical columns are the identifier columns. Unfortunately these cannot be changed as they are used by a 3rd party tool to sync table
Table A
GUID
Name
Address
...
Table B
GUID
Cell
Fax
Home2
...
Are good examples, just assume each table has 400 odd columns.
Obviously the traditional
SELECT a.*, b.* from table_a a, table_b a where a.guid = b.guid
Fails miserably. Is there any easy way to create the view without having to list out 799 individual column names? I was thinking perhaps a one off function to create the view but so far I'm hitting a wall.
You can use dynamic sql as a solution.
CREATE TABLE test1 (id INT, col1 NVARCHAR(50), col2 NVARCHAR(50))
GO
CREATE TABLE test2(id INT, col1 NVARCHAR(50), col2 NVARCHAR(50))
GO
DECLARE #sql NVARCHAR(max) = ''
; WITH cte AS (
SELECT
CASE WHEN TABLE_NAME = 'test1' THEN TABLE_NAME + '.' + COLUMN_NAME + ' AS ' + + COLUMN_NAME + 't1' ELSE TABLE_NAME + '.' + COLUMN_NAME + ' AS ' + + COLUMN_NAME + 't2' END AS a, 1 AS ID
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME IN ('test1', 'test2')
)
SELECT #sql =
'CREATE VIEW myview as
select ' + (
SELECT
STUFF(
(
SELECT ', '+ [A]
FROM cte
WHERE ID = results.ID
FOR XML PATH(''), TYPE
).value('(./text())[1]','VARCHAR(MAX)')
,1,2,''
) AS NameValues
FROM cte results
GROUP BY ID
) + ' from test1 join test2 on test1.id = test2.id'
PRINT #sql
--EXEC (#sql)
The result is
CREATE VIEW myview
AS
SELECT test1.id AS idt1 ,
test1.col1 AS col1t1 ,
test1.col2 AS col2t1 ,
test2.id AS idt2 ,
test2.col1 AS col1t2 ,
test2.col2 AS col2t2
FROM test1
JOIN test2 ON test1.id = test2.id

how to select columns by column index in sql? [duplicate]

Is there a way to access columns by their index within a stored procedure in SQL Server?
The purpose is to compute lots of columns. I was reading about cursors, but I do not know how to apply them.
Let me explain my problem:
I have a row like:
field_1 field_2 field_3 field_4 ...field_d Sfield_1 Sfield_2 Sfield_3...Sfield_n
1 2 3 4 d 10 20 30 n
I need to compute something like (field_1*field1) - (Sfield_1* Sfiled_1) / more...
So the result is stored in a table column d times.
So the result is a d column * d row table.
As the number of columns is variable, I was considering making dynamic SQL, getting the names of columns in a string and splitting the ones I need, but this approach makes the problem harder. I thought getting the column number by index could make life easier.
No, you can not use the ordinal (numeric) position in the SELECT clause.
Only in the ORDER BY clause can you use the ordinal position, because it's based on the column(s) specified in the SELECT clause.
First, as OMG Ponies stated, you cannot reference columns by their ordinal position. This is not an accident. The SQL specification is not built for dynamic schema either in DDL or DML.
Given that, I have to wonder why you have your data structured as you do. A sign of a mismatch between schema and the problem domain rears itself when you try to extract information. When queries are incredibly cumbersome to write, it is an indication that the schema does not properly model the domain for which it was designed.
However, be that as it may, given what you have told us, an alternate solution would be something like the following: (I'm assuming that field_1*field1 was meant to be field_1 * field_1 or field_1 squared or Power( field_1, 2 ) )
Select 1 As Sequence, field_1 As [Field], Sfield_1 As [SField], Sfiled_1 As [SFiled]
Union All Select 2, field_2, Sfield_2, Sfiled_2
...
Union All Select n, field_n, Sfield_n, Sfiled_n
Now your query looks like:
With Inputs As
(
Select 1 As Sequence, field_1 As [Field], Sfield_1 As [SField], Sfiled_1 As [SFiled]
Union All Select 2, field_2, Sfield_2, Sfiled_2
....
)
, Results As
(
Select Case
When Sequence = 1 Then Power( [Field], 2 ) - ( [SField] * [SFiled] )
Else 1 / Power( [Field], 2 ) - ( [SField] * [SFiled] )
End
As Result
From Inputs
)
Select Exp( Sum( Log( Result ) ) )
From Results
This might not be the most elegant or efficient but it works. I am using it to create a new table for faster mappings between data that I need to parse through all the columns / rows.
DECLARE #sqlCommand varchar(1000)
DECLARE #columnNames TABLE (colName varchar(64), colIndex int)
DECLARE #TableName varchar(64) = 'YOURTABLE' --Table Name
DECLARE #rowNumber int = 2 -- y axis
DECLARE #colNumber int = 24 -- x axis
DECLARE #myColumnToOrderBy varchar(64) = 'ID' --use primary key
--Store column names in a temp table
INSERT INTO #columnNames (colName, colIndex)
SELECT COL.name AS ColumnName, ROW_NUMBER() OVER (ORDER BY (SELECT 1))
FROM sys.tables AS TAB
INNER JOIN sys.columns AS COL ON COL.object_id = TAB.object_id
WHERE TAB.name = #TableName
ORDER BY COL.column_id;
DECLARE #colName varchar(64)
SELECT #colName = colName FROM #columnNames WHERE colIndex = #colNumber
--Create Dynamic Query to retrieve the x,y coordinates from table
SET #sqlCommand = 'SELECT ' + #colName + ' FROM (SELECT ' + #colName + ', ROW_NUMBER() OVER (ORDER BY ' + #myColumnToOrderBy+ ') AS RowNum FROM ' + #tableName + ') t2 WHERE RowNum = ' + CAST(#rowNumber AS varchar(5))
EXEC(#sqlCommand)