how to check for duplicate rows of all the columns - sql

I want to check for duplicate rows . and see there column values .
if there were only few columns in my table - 2 for example - I would have done something like:
'''
select col1, col2 ,count(*)
from mytable
group by col1,col2
having count(*) > 1.
'''
but I have dozens of column in my table .... and using the above syntax is tedious to specify all the columns in the table.
trying another approach with select distinct ... will not identify for me the content of duplicated rows .
I tried somthing like
'''
select * , count (*)
from my table
group by *
'''
but that doesn't work.

Write a query which will write a query for you.
For example, "john smith" is a duplicate here:
SQL> select * from my_data order by 1;
FULL_NAME FIRST_NAME LAST_NAME
---------- -------------------- --------------------
h gonzalez h gonzalez
john smith john smith
john smith john smith
rudy chan rudy chan
Query uses user_tab_columns and aggregates all column names, concatenating them to the rest of a select statement:
SQL> SELECT 'select '
2 || LISTAGG (column_name, ', ') WITHIN GROUP (ORDER BY column_id)
3 || ', count(*) cnt '
4 || CHR (10)
5 || ' from '
6 || table_name
7 || CHR (10)
8 || ' group by '
9 || LISTAGG (column_name, ', ') WITHIN GROUP (ORDER BY column_id)
10 || CHR (10)
11 || ' having count(*) > 1;' statement_to_run
12 FROM user_tab_columns
13 WHERE table_name = 'MY_DATA'
14 GROUP BY table_name;
STATEMENT_TO_RUN
--------------------------------------------------------------------------------
select FULL_NAME, FIRST_NAME, LAST_NAME, count(*) cnt
from MY_DATA
group by FULL_NAME, FIRST_NAME, LAST_NAME
having count(*) > 1;
Now, copy/paste the above statement_to_run and get the result:
SQL> select FULL_NAME, FIRST_NAME, LAST_NAME, count(*) cnt
2 from MY_DATA group by
3 FULL_NAME, FIRST_NAME, LAST_NAME having count(*) > 1;
FULL_NAME FIRST_NAME LAST_NAME CNT
---------- -------------------- -------------------- ----------
john smith john smith 2
SQL>

Just write out all the columns.
there are dozens of columns ,about 30 , and there names look like : 'AGtrf-456F_RValue'
Copy-paste.
In SQL*Plus you can use the DESCRIBE command to describe a table and you can copy the column names from the table description.
Or you can list all the columns using:
SELECT '"' || column_name || '",'
FROM user_tab_columns
WHERE table_name = 'MY_DATA'
ORDER BY column_id;
And then copy-paste the output into your query into the SELECT and GROUP BY clauses.
Can you generate the query automatically.
Yes, but it usually is not worth it as it takes longer to write a query to generate the query than it does just to list the columns and copy-paste.
If you have lots of column names that require you to use quoted identifiers (i.e. they are mixed-case or use non-standard characters like -) then you can use:
SELECT EMPTY_CLOB()
|| 'SELECT '
|| LISTAGG('"' || column_name || '"', ',') WITHIN GROUP (ORDER BY column_id)
|| ', COUNT(1) FROM MY_DATA GROUP BY '
|| LISTAGG('"' || column_name || '"', ',') WITHIN GROUP (ORDER BY column_id)
|| ' HAVING COUNT(1) > 1;'
FROM user_tab_columns
WHERE table_name = 'MY_DATA'
ORDER BY column_id;
Which works unless you have too many columns and LISTAGG exceeds 4000 characters then you would need to use something like:
WITH columns (col, pos) AS (
SELECT '"' || column_name || '",',
column_id
FROM user_tab_columns
WHERE table_name = 'MY_DATA'
ORDER BY column_id
)
SELECT sql
FROM (
SELECT 'SELECT ' AS sql, 0 FROM DUAL
UNION ALL
SELECT col, pos FROM columns
UNION ALL
SELECT ' COUNT(1) FROM MY_DATA GROUP BY ', 10000 FROM DUAL
UNION ALL
SELECT col, 10000 + pos FROM columns
UNION ALL
SELECT '1 HAVING COUNT(1) > 1', 20000 FROM DUAL
ORDER BY 2
)
fiddle

Related

ORA-01489 : result of string concatenation to long

I am creating a dynamic query from a list of tables which are their in my table called : get_table_names
My Query :
SELECT listagg('SELECT '
|| ''''
|| tbl_name
|| ''''
|| ' AS TBL , COUNT(*) as CNT FROM '
|| 'TGT_MB.'
|| tbl_name
|| ' union all'
|| CHAR(10) ) WITH GROUP ( ORDER BY tbl_name) AS sql_str
FROM get_table_names;
My get_table_names as lots of table , at least 70 table names.
The query works fine for 10 tables but more then that it throws the error like below
ORA-01489 : result of string concatenation to long
Their is some option called EXTEND that option I cannot touch as I have low level privileges.
cannot make changes to it until DBA.
Any other work around would be much appreciated like using some XMLAGG or CLOB , BLOB
Can I respectively suggest that when posting a code snippet, make it sure it runs so that people can help you more easily. eg your code above took the following debugging before we could even start to see what you want to do
SQL> SELECT listagg('SELECT '
2 || ''''
3 || table_name
4 || ''''
5 || ' AS TBL , COUNT(*) as CNT FROM '
6 || 'TGT_MB.'
7 || table_name
8 || ' union all'
9 || CHAR(10) ) WITH GROUP ( ORDER BY table_name) AS sql_str
10 FROM dba_tables
11 where owner = 'SCOTT';
|| CHAR(10) ) WITH GROUP ( ORDER BY table_name) AS sql_str
*
ERROR at line 9:
ORA-00936: missing expression
SQL> SELECT listagg('SELECT '
2 || ''''
3 || table_name
4 || ''''
5 || ' AS TBL , COUNT(*) as CNT FROM '
6 || 'TGT_MB.'
7 || table_name
8 || ' union all'
9 || chr(10) ) WITH GROUP ( ORDER BY table_name) AS sql_str
10 FROM dba_tables
11 where owner = 'SCOTT';
|| chr(10) ) WITH GROUP ( ORDER BY table_name) AS sql_str
*
ERROR at line 9:
ORA-00923: FROM keyword not found where expected
Once those fixes are done, we end up here
SQL> SELECT listagg('SELECT '
2 || ''''
3 || table_name
4 || ''''
5 || ' AS TBL , COUNT(*) as CNT FROM '
6 || 'TGT_MB.'
7 || table_name
8 || ' union all'
9 || chr(10),'' ) WITHIN GROUP ( ORDER BY table_name) AS sql_str
10 FROM dba_tables
11 where owner = 'SCOTT';
SQL_STR
-------------------------------------------------------------------------------------
SELECT 'BONUS' AS TBL , COUNT(*) as CNT FROM TGT_MB.BONUS union all
SELECT 'DEPT' AS TBL , COUNT(*) as CNT FROM TGT_MB.DEPT union all
SELECT 'EMP' AS TBL , COUNT(*) as CNT FROM TGT_MB.EMP union all
SELECT 'EMP2' AS TBL , COUNT(*) as CNT FROM TGT_MB.EMP2 union all
SELECT 'SALGRADE' AS TBL , COUNT(*) as CNT FROM TGT_MB.SALGRADE union all
which ultimately blows up when we get too many tables
SQL> SELECT listagg('SELECT '
2 || ''''
3 || table_name
4 || ''''
5 || ' AS TBL , COUNT(*) as CNT FROM '
6 || 'TGT_MB.'
7 || table_name
8 || ' union all'
9 || chr(10),'' ) WITHIN GROUP ( ORDER BY table_name) AS sql_str
10 FROM dba_tables;
FROM dba_tables
*
ERROR at line 10:
ORA-01489: result of string concatenation is too long
Now even if we use a tool (eg
https://github.com/connormcd/listagg_clob) to get the results as a clob, then I imagine you are just going to fire that SQL at the database to get a count of all tables. If that is the case, then why do you need to LISTAGG at all, just build a script to do it, ie
SELECT 'SELECT '
|| ''''
|| table_name
|| ''''
|| ' AS TBL , COUNT(*) as CNT FROM '
|| 'TGT_MB.'
|| table_name
|| ' union all'
FROM dba_tables
Spool that to a file, trim the last union all and run it as a script.
Or even better...think about what benefit the output gives you. Why does anyone need the exact row count? Querying NUM_ROWS from USER_TABLES is probably sufficient.

Count number of null values for every column on a table

I would like to calculate, for each column in a table, the percent of rows that are null.
For one column, I was using:
SELECT ((SELECT COUNT(Col1)
FROM Table1)
/
(SELECT COUNT(*)
FROM Table1)) AS Table1Stats
Works great and is fast.
However, I want to do this for all ~50 columns of the table, and my environment does not allow me to use dynamic SQL.
Any recommendations? I am using snowflake to connect to AWS, but as an end user I am using the snowflake browser interface.
You can combine this as:
SELECT COUNT(Col1) * 1.0 / COUNT(*)
FROM Table1;
Or, if you prefer:
SELECT AVG( (Col1 IS NOT NULL)::INT )
FROM Table1;
You can use a mix of object_construct() and flatten() to move the column names into rows. Then do the math for the values missing:
create or replace temp table many_cols as
select 1 a, 2 b, 3 c, 4 d
union all select 1, null, 3, 4
union all select 8, 8, null, null
union all select 8, 8, 7, null
union all select null, null, null, null;
select key column_name
, 1-count(*)/(select count(*) from many_cols) ratio_null
from (
select object_construct(a.*) x
from many_cols a
), lateral flatten(x)
group by key
;
You can do this using a SQL generator if you don't mind copying the text and running it once it's done.
-- SQL generator option:
select 'select' || listagg(' ((select count(' || COLUMN_NAME || ') from "SNOWFLAKE_SAMPLE_DATA"."TPCH_SF10000"."ORDERS") / ' ||
'(select count(*) from "SNOWFLAKE_SAMPLE_DATA"."TPCH_SF10000"."ORDERS")) as ' || COLUMN_NAME, ',') as SQL_STATEMENT
from "SNOWFLAKE_SAMPLE_DATA"."INFORMATION_SCHEMA"."COLUMNS"
where TABLE_CATALOG = 'SNOWFLAKE_SAMPLE_DATA' and TABLE_SCHEMA = 'TPCH_SF10000' and TABLE_NAME = 'ORDERS'
;
If the copy and paste is not plausible because you need to script it, you can use the results of the SQL generator in a stored procedure I wrote to execute a single line of dynamic SQL:
call run_dynamic_sql(
select 'select' || listagg(' ((select count(' || COLUMN_NAME || ') from "SNOWFLAKE_SAMPLE_DATA"."TPCH_SF10000"."ORDERS") / ' ||
'(select count(*) from "SNOWFLAKE_SAMPLE_DATA"."TPCH_SF10000"."ORDERS")) as ' || COLUMN_NAME, ',') as SQL_STATEMENT
from "SNOWFLAKE_SAMPLE_DATA"."INFORMATION_SCHEMA"."COLUMNS"
where TABLE_CATALOG = 'SNOWFLAKE_SAMPLE_DATA' and TABLE_SCHEMA = 'TPCH_SF10000' and TABLE_NAME = 'ORDERS'
);
If you want the stored procedure, until it's published on Snowflake's blog it's available here: https://snowflake.pavlik.us/index.php/2021/01/22/running-dynamic-sql-in-snowflake/

Oracle: Count non-null fields for each column in a table

I need a query to count the total number of non-null values for each column in a table. Since my table has hundreds of columns I'm looking for a solution that only requires me to input the table name.
Perhaps using the result of:
select COLUMN_NAME from ALL_TAB_COLUMNS where TABLE_NAME='ORDERS';
to get the column names and then a subquery to put counts against each column name? The additional complication is that I only have read-only access to the DB so I can't create any temp tables.
Slightly out of my league with this one so any help is appreciated.
Construct the query in SQL or using a spreadsheet. Then run the query.
For instance, assuming that your column names are simple and don't have special characters:
select replace('select ''[col]'', count([col]) from orders union all ',
'[col]', COLUMN_NAME
) as sql
from ALL_TAB_COLUMNS
where TABLE_NAME = 'ORDERS';
(Of course, this can be adapted for more complex column names, but I'm trying to show the idea.)
Then copy the code, remove the final union all and run it.
You can put this in one string if there are not too many columns:
select listagg(replace('select ''[col]'', count([col]) from orders',
'[col]', COLUMN_NAME
), ' union all '
) within group (order by column_name) as sql
from ALL_TAB_COLUMNS
where TABLE_NAME = 'ORDERS';
You can also use execute immediate using the same query, but that seems like overkill.
If you're happy with the results row-ar rather than column-ar:
SELECT 'SELECT ''dummy'', 0 FROM DUAL' FROM DUAL
UNION ALL
SELECT
' UNION ALL SELECT ''' ||
column_name ||
''', COUNT(' ||
column_name ||
') FROM ' ||
TABLE_NAME
FROM
all_tab_columns
WHERE
table_name = 'ORDERS'
This is an "SQL that writes an SQL" that you can then copy and run to get your answers. Should make a resultset that looks like:
SELECT 'dummy', 0 FROM dual
UNION ALL SELECT 'col1', COUNT(col1) FROM ORDERS
UNION ALL SELECT 'col2', COUNT(col2) FROM ORDERS
...
If you want your results column-ar:
SELECT 'SELECT '
UNION ALL
SELECT
'COUNT(' ||
column_name ||
') as count_' ||
column_name ||
', ' ||
TABLE_NAME
FROM
all_tab_columns
WHERE
table_name = 'ORDERS'
UNION ALL
SELECT 'null as dummy_column FROM ORDERS'
Should make a resultset that looks like:
SELECT
COUNT(col1) as count_col1,
COUNT(col2) as count_col2,
...
null as dummycoll FROM orders
Caveat: I don't have oracle installed anywhere I can test these, it's written from memory and may need some debugging
This will generate the SQL to get the counts in columns and will handle case sensitive column names and column names with non-alpha-numeric characters:
SELECT 'SELECT '
|| LISTAGG(
'COUNT("' || column_name || '") AS "' || column_name || '"',
', '
) WITHIN GROUP ( ORDER BY column_id )
|| ' FROM "' || table_name || '"' AS sql
FROM ALL_TAB_COLUMNS
WHERE TABLE_NAME = 'ORDERS'
GROUP BY TABLE_NAME;
or, if you have a large number of columns that is generating a string longer than 4000 characters you can use a custom aggregation function to aggregate VARCHAR2s into a CLOB and then do:
SELECT 'SELECT '
|| CLOBAgg( 'COUNT("' || column_name || '") AS "' || column_name || '"' )
|| ' FROM "' || table_name || '"' AS sql
FROM ALL_TAB_COLUMNS
WHERE TABLE_NAME = 'ORDERS'
GROUP BY TABLE_NAME;
In Oracle 19 (I used similar code in Ora 12, maybe that works too), this works without generating another select to execute:
select * from
(
select table_name, column_name,
to_number( extractvalue( xmltype(dbms_xmlgen.getxml('select count(to_char(substr('||column_name||',1,1))) c from '||table_name)) ,'/ROWSET/ROW/C')) count
from all_tab_columns where owner = user
)
--where table_name = 'MY_TABLE'
;
It will create XML with count, from which it extracts the current count. The substr and to_char functions here are used to extract first character, so it will works with CLOB columns also

Vertica. Count of Null and Not-Null of all columns of a Table

How can we get null and non-null counts of all columns of a Table in Vertica? Table can have n number of columns and for each column we need to get count of nulls and non-nulls values of that table.
For Example.
Below Table has two columns
column1 Column2
1 abc
pqr
3
asd
5
If its a specific column then we can check like
SELECT COUNT(*) FROM table where column1 is null;
SELECT COUNT(*) FROM table where column1 is not null;
Same query for column2
I checked system tables like projection_storage and others but I cant figure out a generic query which gives details by hard coding only TABLE NAME in the query.
Hello #user2452689: Here is a dynamically generated VSQL statement which meets your requirement of counting nulls & not nulls in N columns. Notice that this writes a temporary SQL file out to your working directory, and then execute it via the \i command. You only need to change the first two variables per table. Hope this helps - good luck! :-D
--CHANGE SCHEMA AND TABLE PARAMETERS ONLY:
\set table_schema '\'public\''
\set table_name '\'dim_promotion\''
---------
\o temp_sql_file
\pset tuples_only
select e'select \'' || :table_schema || e'\.' || :table_name || e'\' as table_source' as txt
union all
select * from (
select
', sum(case when ' || column_name || ' is not null then 1 else 0 end) as ' || column_name || '_NOT_NULL
, sum(case when ' || column_name || ' is null then 1 else 0 end) as ' || column_name || '_NULL' as txt
from columns
where table_schema = :table_schema
and table_name = :table_name
order by ordinal_position
) x
union all
select ' from ' || :table_schema || e'.' || :table_name || ';' as txt ;
\o
\pset tuples_only
\i temp_sql_file
You can use:
select count(*) as cnt,
count(column1) as cnt_column1,
count(column2) as cnt_column2
from t;
count() with a column name or expression counts the number of non-NULL values in the column/expression.
(Obviously, the number of NULL values is cnt - cnt_columnX.)
select column1_not_null
,column2_not_null
,column3_not_null
,cnt - column1_not_null as column1_null
,cnt - column2_not_null as column2_null
,cnt - column3_not_null as column3_null
from (select count(*) as cnt
,count (column1) as column1_not_null
,count (column2) as column2_not_null
,count (column3) as column3_not_null
from mytable
) t

Comparing column by column between two rows in Oracle DB

I need to write a query to compare column by column (ie: find differences) between two rows in the database. For example:
row1: 10 40 sometext 24
row2: 10 25 sometext 24
After the query executed, it should shows only the fields that have difference (ie: the second field)
Here's what I have done so far:
select table1.column1, table1.column2, table1.column3, table1.column4
from table1
where somefield in (field1, field2);
The above query will show me two rows one above another like this:
10 40 sometext 24
10 25 sometext 24
Then I have to manually do the comparison and it takes a lot of time b/c the row contains a lot of column.
So again my question is: How can I write a query that will show me only the columns that have differences??
Thanks
Use UNPIVOT clause (see http://www.oracle-developer.net/display.php?id=506) to turn columns into rows, then filter out the same rows (using GROUP BY HAVING COUNT and finally use PIVOT to get rows with different columns only.
To do this easily you need to query the metadata for the table to get each row. You can use the following code as a script.
Replace the define table_name with your table name and define yes_drop_it = NO. Put your raw WHERE syntax into the where_clause. The comparison logic always compares the first two rows returned for the where clause.
whenever sqlerror exit failure rollback;
set linesize 150
define test_tab_name = tst_cf_cols
define yes_drop_it = YES
define order_by = 1, 2
define where_clause = 1 = 1
define tab_owner = user
<<clearfirst>> begin
for clearout in (
select 'drop table ' || table_name as cmd
from all_tables
where owner = &&tab_owner and table_name = upper('&&test_tab_name')
and '&&yes_drop_it' = 'YES'
) loop
execute immediate clearout.cmd;
execute immediate '
create table &&test_tab_name as
select 10 as column1, 40 as column2, ''sometext'' as column3, 24 as column4 from dual
union all
select 10 as column1, 25 as column2, ''sometext'' as column3, 24 as column4 from dual
';
end loop;
end;
/
column cfsynt format a4000 word_wrap new_value comparison_syntax
with parms as (select 'parmquery' as cte_name, 'row_a' as corr_name_1, 'row_b' as corr_name_2 from dual)
select
'select * from (select ' || LISTAGG(cfcol || ' AS cf_' || trim (to_char (column_id, '000')) || '_' || column_name
, chr(13) || ', ') WITHIN GROUP (order by column_id)
|| chr(13) || ' from (select * from parmquery where row_number = 1) ' || corr_name_1
|| chr(13) || ', (select * from parmquery where row_number = 2) ' || corr_name_2
|| chr(13) || ') where ''DIFFERENT'' IN (' || LISTAGG ('cf_' || trim (to_char (column_id, '000')) || '_' || column_name, chr(13) || ', ') within group (order by column_id) || ')'
as cfsynt
from parms, (
select
'decode (' || corr_name_1 || '.' || column_name || ', ' || corr_name_2
|| '.' || column_name || ', ''SAME'', ''DIFFERENT'')'
as cfcol,
column_name,
column_id
from
parms,
all_tab_columns
where
owner = &&tab_owner and table_name = upper ('&&test_tab_name')
);
with parmquery as (select rownum as row_number, vals.* from (
select * from &&test_tab_name
where &&where_clause
order by &&order_by
) vals
) &&comparison_syntax
;