How to null-check multiple columns, with casting for reporting elements - SQL

I'm looking for the most efficient way to check for nulls and produce the desired output for a report. This runs in a Hadoop environment.
For example,
Database contains:
FirstName | LastName | State
----------|----------|------
John      | {null}   | OH
Jane      | Smith    | {null}
Pikachu   | Pokemon  | HI
The output from the query would be:
RuleNum | ColName   | DefectCount | TableCount | DateofData
--------|-----------|-------------|------------|-----------
Rule001 | FirstName | 0           | 50         | 2023-01-01
Rule002 | LastName  | 1           | 50         | 2023-01-01
Rule003 | State     | 1           | 50         | 2023-01-01
The embarrassingly clunky and inefficient query that I use, which works but takes a LONG time to return results (and we check thousands of columns), is below. I'm also unsure how to include the table count efficiently:
SELECT
    CAST('Rule001' AS VARCHAR(255)) AS RuleNum,
    CAST('FirstName' AS VARCHAR(255)) AS ColName,
    A.DefectCount AS DefectCount,
    CAST(date_add(current_date(), -1) AS VARCHAR(255)) AS DateofData
FROM
    (SELECT COUNT(1) AS DefectCount
     FROM DbName.TableName
     WHERE FirstName IS NULL) A
UNION ALL
SELECT
    CAST('Rule002' AS VARCHAR(255)) AS RuleNum,
    CAST('LastName' AS VARCHAR(255)) AS ColName,
    A.DefectCount AS DefectCount,
    CAST(date_add(current_date(), -1) AS VARCHAR(255)) AS DateofData
FROM
    (SELECT COUNT(1) AS DefectCount
     FROM DbName.TableName
     WHERE LastName IS NULL) A
UNION ALL
SELECT
    CAST('Rule003' AS VARCHAR(255)) AS RuleNum,
    CAST('State' AS VARCHAR(255)) AS ColName,
    A.DefectCount AS DefectCount,
    CAST(date_add(current_date(), -1) AS VARCHAR(255)) AS DateofData
FROM
    (SELECT COUNT(1) AS DefectCount
     FROM DbName.TableName
     WHERE State IS NULL) A
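One way to avoid scanning the table once per rule is to compute every null count plus the total row count in a single pass with conditional aggregation, and then turn that one wide row into one report row per rule. Below is a minimal sketch, assuming Hive-style SQL (which the date_add(current_date(), -1) call suggests) and Hive's stack() UDTF; the rule and column names simply mirror the example above:

-- One scan of the table: total rows plus a null count per column.
WITH agg AS (
  SELECT
    COUNT(1)                                           AS TableCount,
    SUM(CASE WHEN FirstName IS NULL THEN 1 ELSE 0 END) AS FirstNameNulls,
    SUM(CASE WHEN LastName  IS NULL THEN 1 ELSE 0 END) AS LastNameNulls,
    SUM(CASE WHEN State     IS NULL THEN 1 ELSE 0 END) AS StateNulls
  FROM DbName.TableName
)
-- Unpivot the single aggregate row into one report row per rule.
SELECT
  r.RuleNum,
  r.ColName,
  r.DefectCount,
  agg.TableCount,
  CAST(date_add(current_date(), -1) AS VARCHAR(255)) AS DateofData
FROM agg
LATERAL VIEW stack(3,
  'Rule001', 'FirstName', agg.FirstNameNulls,
  'Rule002', 'LastName',  agg.LastNameNulls,
  'Rule003', 'State',     agg.StateNulls
) r AS RuleNum, ColName, DefectCount;

Because TableCount comes from the same aggregate row, it lands on every output row without a second scan, and for thousands of columns the CASE/stack() lists could be generated from the column metadata rather than written by hand.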

Related

Is there a way to transpose output of table as column headers?

I have a table candidate
id candidate_name
---------------------------
1 john
2 mary
and another table units
id name
--------
1 unit1
2 unit2
3 unit3
I would like to generate an output as:
id candidate_name unit1 unit2 unit3
---------------------------------------
1 john null null null
2 mary null null null
Any way I can achieve this?
Your data:
CREATE TABLE candidate(
id int NOT NULL
,candidate_name VARCHAR(40)
);
INSERT INTO candidate
(id,candidate_name) VALUES
(1,'john'),
(2,'mary');
CREATE TABLE units(
id int NOT NULL
,name VARCHAR(50)
);
INSERT INTO units
(id,name) VALUES
(1,'unit1'),
(2,'unit2'),
(3,'unit3');
You should use CROSS JOIN and PIVOT:
select *
from
(
    select
        c.id,
        candidate_name,
        cast(null as int) id1,
        name
    from candidate c
    CROSS JOIN units u
) src
pivot (
    max(id1) for name in ([unit1], [unit2], [unit3])
) piv;
Using dynamic SQL:
DECLARE @SQL nvarchar(max);
DECLARE @names nvarchar(1000) = (
    SELECT STRING_AGG(concat('[', name, ']'), ',')
           WITHIN GROUP (ORDER BY id)
    FROM units);
SET @SQL = 'select *
from
(
    select
        c.id,
        candidate_name,
        cast(null as int) id1,
        name
    from candidate c
    CROSS JOIN units u
) src
pivot (
    max(id1) for name in (' + @names + ')
) piv;'
EXEC (@SQL);
dbfiddle

Searching a column value that has couple records with some different specified values

I have a database:
|id|surname|name
| 1|Smith |John
| 2|Smith |Mike
| 3|Smith |Bob
| 4|Knope |John
| 5|Knope |Mike
| 6|Knope |Dick
| 7|Pratt |John
| 8|Pratt |Jill
| 9|Pratt |James
and I want to find a family name that contains John, Mike and Bob; it should return Smith. Or I want to search for a family that contains John and Mike, and it should return Smith and Knope. How can I do that?
The results I want are above, but here they are in a nicer form:
I'm looking for a family that has John, Mike and Bob. I want to get:
|surname|
|Smith |
Then I want to look for John and Mike only and I want to get:
|surname|
|Smith |
|Knope |
Based on your explanations you can do something like this:
declare @table table (id int, surename varchar(50), name1 varchar(50))
insert into @table
values
(1,'Smith','John'),
(2,'Smith','Mike'),
(3,'Smith','Bob' ),
(4,'Knope','John'),
(5,'Knope','Mike'),
(6,'Knope','Dick'),
(7,'Pratt','John'),
(8,'Pratt','Jill'),
(9,'Pratt','James')
select * from @table
where name1 in ('john','mike','bob') and surename = 'smith'
union
select * from @table
where name1 in ('john','mike') and surename in ('smith','knope')
Result
Your input has a variable number of values, so I will assume you have put all of the names you want the family to include in a table #i with a column "name". Then you want all families for which no name specified in your input is missing.
select *
from (select distinct surename from yourtable)surnames
where not exists
(
select 1 from #i
where not exists
(
select 1
from yourtable t
where
t.surename=surnames.surename
and #i.name=t.name
)
)
This works in SQL Server - written before your question was tagged with PostgreSQL.
Set up the test data:
DECLARE @Names TABLE (ID INTEGER IDENTITY, Surname VARCHAR(50), Forenames VARCHAR(50));
INSERT
@Names (Surname, Forenames)
VALUES
('Smith', 'John'),
('Smith', 'Mike'),
('Smith', 'Bob' ),
('Knope', 'John'),
('Knope', 'Mike'),
('Knope', 'Dick'),
('Pratt', 'John'),
('Pratt', 'Jill'),
('Pratt', 'James');
Declare a table variable containing the forenames you'd like to match. This acts as a parameter, so you should edit the values we insert to test the results:
DECLARE @ForenamesToSearch TABLE (Forenames VARCHAR(50));
INSERT
@ForenamesToSearch
VALUES
('John')
, ('Mike')
, ('Bob');
Finally we use GROUP BY and HAVING COUNT to ensure the number of names matches exactly.
SELECT
Surname
FROM
(SELECT DISTINCT Forenames, Surname FROM @Names) Names
INNER JOIN @ForenamesToSearch Forenames ON Names.Forenames = Forenames.Forenames
GROUP BY
Surname
HAVING
COUNT(1) = (SELECT COUNT(1) FROM @ForenamesToSearch);
Probably not the best way to do this but you can try the following for Postgresql:
select *
from
(
select
concat(',' , string_agg(name1,',') , ',') as X,
surname
from
table_name as A
group BY
surname
) As B
Where B.X like '%,John,%' And B.X like '%,Mike,%' And B.X like '%,Bob,%';
SQLFIDDLE DEMO
The following is for SQL server:
select * from
(
select
', ' + STUFF((SELECT ', ' + name1 FROM table_name WHERE surname = A.surname FOR XML PATH('')),1,2,'') + ',' as X,
surname
from
table_name as A
group BY
surname
) as B
Where B.X like '%, John,%' And B.X like '%, Mike,%' And B.X like '%, Bob,%';
SQLFIDDLE DEMO

Getting different column values into one single column

I have 5 columns and I want to get the values from those 5 columns and put them into a single column.
+---------------+----------+----------+----------+---------------+
| Option_1 | Option_2 | Option_3 | Option_4 | Option_5 |
+---------------+----------+----------+----------+---------------+
| Below average | Average | Good | Better | Above average |
+---------------+----------+----------+----------+---------------+
So in the output I want Below average, Average, Good, Better and Above average in one column named Option.
UNION ALL can be used:
SELECT
Option1 AS Options
FROM (SELECT
option1
FROM tablename
UNION ALL
SELECT
option2
FROM tablename
UNION ALL
SELECT
option3
FROM tablename
UNION ALL
SELECT
option4
FROM tablename
UNION ALL
SELECT
option5
FROM tablename) tablename;
or
UNPIVOT performs almost the reverse operation of PIVOT, by rotating columns into rows.
SELECT
opt.Options
FROM (SELECT
*
FROM yourtable) t
UNPIVOT (Options FOR tables IN ([Option_1], [Option_2], [Option_3], [Option_4], [Option_5])) AS opt
ORDER BY tables;
For more info please visit: https://learn.microsoft.com/en-us/sql/t-sql/queries/from-using-pivot-and-unpivot
declare @table table (Option_1 varchar(50), Option_2 varchar(50), Option_3 varchar(50), Option_4 varchar(50), Option_5 varchar(50))
insert into @table
select 'Below average', 'Average', 'Good','Better','Above average'
select p.typename,p.numericvalue from (
select *
from @table
) a
unpivot(numericvalue for typename in([Option_1] ,[Option_2] , [Option_3] , [Option_4] , [Option_5] )) as p
order by typename
output
typename numericvalue
Option_1 Below average
Option_2 Average
Option_3 Good
Option_4 Better
Option_5 Above average
Sample Data
IF OBJECT_ID('dbo.Sampletable')IS NOT NULL
DROP TABLE Sampletable
GO
CREATE TABLE Sampletable (
Option_1 varchar(50)
,Option_2 varchar(50)
,Option_3 varchar(50)
,Option_4 varchar(50)
,Option_5 varchar(50)
)
INSERT INTO Sampletable
SELECT 'Below average', 'Average', 'Good','Better','Above average'
Using dynamic SQL and CROSS APPLY we get the expected result:
DECLARE @DynamicCol nvarchar(max), @Sql nvarchar(max)
SELECT @DynamicCol =
STUFF((SELECT ', '+'('+''''+COLUMN_NAME+''''+','+ COLUMN_NAME +')' FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME='Sampletable'
FOR XML PATH ('')),1,1,'')
SET @Sql='
SELECT Typename
,Numericvalue
FROM Sampletable t
CROSS APPLY (VALUES '+@DynamicCol+
') AS dt (Typename, Numericvalue)'
PRINT @Sql
EXEC (@Sql)
Sample demo
Result
Typename Numericvalue
-------------------------
Option_1 Below average
Option_2 Average
Option_3 Good
Option_4 Better
Option_5 Above average
SELECT CONCAT(Option_1, ', ', Option_2, ', ', Option_3, ', ', Option_4, ', ', Option_5) as Option
FROM TAble1
If you want the result as one column but multiple rows:
SELECT Option_1 AS Option FROM Table1
UNION ALL
SELECT Option_2 AS Option FROM Table1
UNION ALL
SELECT Option_3 AS Option FROM Table1
UNION ALL
SELECT Option_4 AS Option FROM Table1
UNION ALL
SELECT Option_5 AS Option FROM Table1
Using UNPIVOT
SELECT
rslt.Options
FROM (SELECT
*
FROM Table1 ) AS dt
UNPIVOT (Options FOR tables IN ([Option_1], [Option_2], [Option_3], [Option_4], [Option_5])) AS rslt
Sample Link

How to find sets of rows with one or more fields matching and assign a set id for each matching set?

I have a requirement to find sets of rows where one or more fields are matching.
E.g:
Vendor Master
VendorId | VendorName | Phone | Address | Fax
------------------------------------------------------------------------
1 AAAA 10101 Street1 111
2 BBBB 20202 Street2 222
3 CCCC 30303 Street3 333
4 DDDD 40404 Street2 444
5 FFFF 50505 Street5 555
6 GGGG 60606 Street6 444
7 HHHH 10101 Street6 777
SELECT VendorId FROM VendorMaster vm
WHERE EXISTS
( SELECT 1 FROM VendorMaster vm1
  WHERE vm1.VendorId <> vm.VendorId
  AND (vm1.Phone = vm.Phone OR vm1.Address = vm.Address OR vm1.Fax = vm.Fax)
)
With the above query I am getting records, but my requirement is to assign a set-id for each set of matching records.
Like below:
SetId | VendorId
---------------------
1000 1
1000 7 //1 and 7- Phone numbers are matching
1001 2
1001 4 //2 and 4 - Address matching
1001 6 // 4 and 6 - Fax matching
Please advise me on how to write a query to assign set ids for matching sets. The performance of the query is also key here as the number of records will be around 100,000.
Thanks
I believe this will give you your desired result. A little explanation is in the comments, let me know if more is needed.
with relations
--Get all single relationships between vendors.
as (
select t1.vendorId firstId,
t2.vendorId secondId
from VendorMaster t1
inner join VendorMaster t2 on t1.vendorId < t2.vendorId and(
t1.Phone = t2.Phone
or t1.address = t2.address
or t1.Fax = t2.Fax
)
),
recurseLinks
--Recurse the relationships
as (
select r.*, CAST(',' + CAST(r.firstId AS VARCHAR) + ',' AS VARCHAR) tree
from relations r
union all
select r.firstId,
l.secondId,
cast(r.Tree + CAST(l.secondId AS varchar) + ',' as varchar)
from relations l
inner join recurseLinks r on r.secondId = l.firstId and r.tree not like '%' + cast(l.secondId as varchar) + ',%'
union all
select r.firstId,
l.firstId,
cast(r.Tree + CAST(l.firstId AS varchar) + ',' as varchar)
from relations l
inner join recurseLinks r on r.secondId = l.secondId and r.tree not like '%' + cast(l.firstId as varchar) + ',%'
),
removeInvalid
--Removed invalid relationships.
as (
select l1.firstId, l1.secondId
from recurseLinks l1
where l1.firstId < l1.secondId
),
removeIntermediate
--Removed intermediate relationships.
as (
select distinct l1.*
from removeInvalid l1
left join removeInvalid l2 on l2.secondId = l1.firstId
where l2.firstId is null
)
select result.secondId,
dense_rank() over(order by result.firstId) SetId
from (
select firstId,
secondId
from removeIntermediate
union all
select distinct firstId,
firstId
from removeIntermediate
) result;
The 'relations' named result set returns all VendorMaster relationships where two vendors share a common Phone, Address or Fax. It only returns [A,B]; it won't return the reverse relationship [B,A].
The 'recurseLinks' named result set is a little more complex. It recursively joins all rows that are related to each other. The tree column keeps track of lineage so it won't get stuck in an endless loop. The first query of this union selects all the relations from the 'relations' named result set. The second query of this union selects all the forward recursive relationships, so given [A,B], [B,C] and [C,D] then [A,C], [A,D] and [B,D] are added to the result set. The third query of the union selects all the non-forward recursive relationships, so given [A,D], [C,D], [B,C] then [A,C], [A,B] and [B,D] are added to the result set.
The 'removeInvalid' named result set removes any invalid intermediate relationships added by the recursive query. For Example, [B,A] because we will already have [A,B]. Note this could have been prevented in the 'recurseLinks' result set with some effort.
The 'removeIntermediate' named result set removes any intermediate relationships. So given [A,B],[B,C], [C,D], [A,C], [A,D] it will remove [B,C] and [C,D].
The final result set selects the current results and adds in a self relationship, so given [A,B], [A,C], [A,D] it adds [A,A], which produces our final result set.
You can use the built in Ranking functions to accomplish this. For example, for unique Address values:
DECLARE @VendorMaster TABLE ( VendorID INT, Vendorname VARCHAR(20), Phone VARCHAR(20), Address VARCHAR(20), Fax VARCHAR(20) )
INSERT INTO @VendorMaster
(VendorID, Vendorname, Phone, Address, Fax )
VALUES
(1, 'AAAA', '10101', 'Street1', '111'),
(2, 'BBBB', '20202', 'Street2', '222'),
(3, 'CCCC', '30303', 'Street3', '333'),
(4, 'DDDD', '40404', 'Street2', '444'),
(5, 'FFFF', '50505', 'Street5', '555'),
(6, 'GGGG', '60606', 'Street6', '444'),
(7, 'HHHH', '10101', 'Street6', '777')
SELECT
DenseRank = DENSE_RANK() OVER ( ORDER BY Address )
,RowNumber = ROW_NUMBER() OVER ( ORDER BY VendorID )
,* FROM @VendorMaster
Results
DenseRank RowNumber VendorID Vendorname Phone Address Fax
1 1 1 AAAA 10101 Street1 111
2 2 2 BBBB 20202 Street2 222
3 3 3 CCCC 30303 Street3 333
2 4 4 DDDD 40404 Street2 444
4 5 5 FFFF 50505 Street5 555
5 6 6 GGGG 60606 Street6 444
5 7 7 HHHH 10101 Street6 777
If these SetId values need to persist, you could create a separate table with an identity column to track the values associated with each SetID for each set up. It sounds like you may simply want to normalize your database and break out data elements being duplicated into their own tables linked by an identity column relationship.
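If you do go that route, a minimal sketch of such tracking tables (hypothetical names, assuming SQL Server) might look like this, with the identity column supplying the SetID:

-- Hypothetical persistence for set membership; adjust names and types to your schema.
CREATE TABLE dbo.VendorSet (
    SetID     int IDENTITY(1000, 1) PRIMARY KEY,   -- the identity value becomes the persistent SetID
    CreatedOn datetime NOT NULL DEFAULT (GETDATE())
);

CREATE TABLE dbo.VendorSetMember (
    SetID    int NOT NULL REFERENCES dbo.VendorSet (SetID),
    VendorID int NOT NULL,
    PRIMARY KEY (SetID, VendorID)                  -- a vendor appears at most once per set
);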
Although Will's answer is pretty ingenious, I've never really liked recursive CTEs very much, because they work great on small sets but quickly become very slow on larger ones and sometimes hit the MAXRECURSION limit.
Personally I'd try to solve this by first putting every VendorID in its own SetID and then merge the upper SetIDs into lower SetIDs that have a matching Vendor.
It would then look something like this:
-- create test-code
IF OBJECT_ID('VendorMaster') IS NOT NULL DROP TABLE VendorMaster
GO
CREATE TABLE VendorMaster
([VendorID] int IDENTITY(1,1) PRIMARY KEY, [Vendorname] nvarchar(100), [Phone] nvarchar(100) , [Address] nvarchar(100), [Fax] nvarchar(100))
;
INSERT INTO VendorMaster
([Vendorname], [Phone], [Address], [Fax])
VALUES
('AAAA', '10101', 'Street1', '111'),
('BBBB', '20202', 'Street20', '222'),
('CCCC', '30303', 'Street3', '333'),
('DDDD', '40404', 'Street2', '444'),
('FFFF', '50505', 'Street5', '555'),
('GGGG', '60606', 'Street6', '444'),
('HHHH', '10101', 'Street6', '777'),
('IIII', '80808', 'Street20', '888'),
('JJJJ', '90909', 'Street9', '888');
GO
-- create sets and start shifting & merging
DECLARE @rowcount int
SELECT SetID = 1000 + ROW_NUMBER() OVER (ORDER BY VendorID),
VendorID
INTO #result
FROM VendorMaster
SELECT @rowcount = @@ROWCOUNT
CREATE UNIQUE CLUSTERED INDEX uq0 ON #result (VendorID)
WHILE @rowcount > 0
BEGIN
-- find lowest SetID that has a match with current record
;WITH shifting
AS (SELECT newSetID = Min(n.SetID),
oldSetID = o.SetID
FROM #result o
JOIN #result n
ON n.SetID < o.SetID
JOIN VendorMaster vo
ON vo.VendorID = o.VendorID
JOIN VendorMaster vn
ON vn.VendorID = n.VendorID
WHERE vn.Vendorname = vo.Vendorname
OR vn.Phone = vo.Phone
OR vn.Address = vo.Address
OR vn.Fax = vo.Fax
GROUP BY o.SetID)
UPDATE upd
SET SetID = s.newSetID
FROM #result upd
JOIN shifting s
ON s.oldSetID = upd.SetID
AND s.newSetID < upd.SetID
SELECT @rowcount = @@ROWCOUNT
END
-- delete 'single-member' sets for consistency when comparing with Will's CTE
DELETE del
FROM #result del
WHERE NOT EXISTS ( SELECT *
FROM #result xx
WHERE xx.SetID = del.SetID
AND xx.VendorID <> del.VendorID)
-- fix 'holes'
UPDATE upd
SET SetID = 1 + (SELECT COUNT(DISTINCT SetID)
FROM #result xx
WHERE xx.SetID < upd.SetID)
FROM #result upd
-- show result
SELECT * FROM #result ORDER BY SetID, VendorID
When running this on the test-case provided, I get the same results as the CTE, although it takes a bit longer.
When I add some extra test-data, things become interesting though.
DECLARE @counter int = 7
WHILE @counter > 0
BEGIN
INSERT VendorMaster ([Vendorname], [Phone], [Address], [Fax])
SELECT [Vendorname] = NewID(),
[Phone] = ABS(BINARY_CHECKSUM(NewID())) % 1500,
[Address] = NewID(),
[Fax] = NewID()
FROM VendorMaster
SELECT @counter = @counter - 1
END
SELECT COUNT(*) FROM VendorMaster
This gives me 1152 test records with the matches we already had before, but now also with some matches on Phone (the NewID() values will never match), to make things easier to verify.
When I run the query above on this, I get 604 sets in just under 2 seconds. However, when I run the CTE on it, it takes considerably longer.

Split string, copy data into multiple tables

I have the following tables
Table A
RID | Name |Phone |Email |CreatedOn
------------------------------------------------------------
1 | John Smith | 2143556789 |t1@gmail.com |2012-09-01 09:30:00
2 | Jason K Crull | 2347896543 |t2@gmail.com |2012-08-02 10:34:00
Table B
CID| FirstName |LastName |Phone |Email |CreatedOn |Title|Address|City|State
---------------------------------------------------------------------------------------------------
11 | John | Smith |2143556789 |t1@gmail.com |2012-09-01 09:30:00|NULL|NULL|NULL|NULL
12 | Jason | K Crull |2347896543 |t2@gmail.com |2012-08-02 10:34:00|NULL|NULL|NULL|NULL
Table C
RID | CID |IsAuthor|CreatedOn
-----------------------------------------
1 | 11 | 0 |2012-09-01 09:30:00
2 | 12 | 0 |2012-08-02 10:34:00
For every row in "Table A" I need to create a row in "Table B", splitting the name into first and last name as shown, and after creating that row, insert a new row into Table C with the RID from Table A, the CID from Table B, IsAuthor defaulting to 0, and CreatedOn from Table A. The CID is auto-incremented. Can anyone help me achieve this? I am very new to SQL. Thanks!
I believe you're looking for something like this (I left off some fields, but this should get the point across). The main thing to see is the SUBSTRING and CHARINDEX functions, which are used to split the name into first and last names:
insert into tableb (firstname,lastname,phone,email)
select
left(name, charindex(' ',name)-1),
substring(name, charindex(' ', name)+1, len(name)),
phone, email
from tablea ;
insert into tablec
select a.rid, b.cid, 0, a.createdon
from tablea a
inner join tableb b on a.name = b.firstname + ' ' + b.lastname
and a.phone = b.phone and a.email = b.email ;
SQL Fiddle Demo
If there is a concern about duplicate names, emails, etc., then you're probably going to need to look into using a dreaded cursor and SCOPE_IDENTITY(). Hopefully you won't have to go down that route.
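For completeness, a rough sketch of that cursor route (assuming the TableA/TableB/TableC definitions used in this thread; like the split logic above, it only handles names that contain a space):

DECLARE @RID int, @Name nvarchar(64), @Phone nvarchar(16),
        @Email nvarchar(256), @CreatedOn datetime, @CID int;

DECLARE rows_cur CURSOR LOCAL FAST_FORWARD FOR
    SELECT RID, Name, Phone, Email, CreatedOn FROM dbo.TableA;

OPEN rows_cur;
FETCH NEXT FROM rows_cur INTO @RID, @Name, @Phone, @Email, @CreatedOn;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- Insert one TableB row, splitting the name on the first space.
    INSERT INTO dbo.TableB (FirstName, LastName, Phone, Email, CreatedOn)
    VALUES (LEFT(@Name, CHARINDEX(' ', @Name) - 1),
            SUBSTRING(@Name, CHARINDEX(' ', @Name) + 1, LEN(@Name)),
            @Phone, @Email, @CreatedOn);

    -- SCOPE_IDENTITY() returns the CID generated by the insert above.
    SET @CID = SCOPE_IDENTITY();

    INSERT INTO dbo.TableC (RID, CID, IsAuthor, CreatedOn)
    VALUES (@RID, @CID, 0, @CreatedOn);

    FETCH NEXT FROM rows_cur INTO @RID, @Name, @Phone, @Email, @CreatedOn;
END;

CLOSE rows_cur;
DEALLOCATE rows_cur;

The set-based answers here are still preferable; the cursor only earns its keep when the generated CID genuinely cannot be matched back by a natural key.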
Stop thinking about "for every row" and think of it as "a set." Very rarely is it efficient to process anything row-by-row in SQL Server, and very rarely is it beneficial to think in those terms.
--INSERT dbo.TableC(RID, CID, IsAuthor, CreatedOn)
SELECT a.RID, b.CID, IsAuthor = 0, a.CreatedOn
FROM dbo.TableA AS a
INNER JOIN dbo.TableB AS b
ON a.Name = b.FirstName + ' ' + b.LastName;
When you believe it is returning the right results, uncomment the INSERT.
To split the name I'd use CharIndex to find the position of the space, then Substring to break the word apart.
For keeping track of which row in TableA the data in TableB came from, I'd just stick a column onto B to record this data, then drop it when you come to insert into Table C.
An alternative would be to make CID an identity column on C instead of B, populate C first, then feed that data into TableB when you come to populate that.
if OBJECT_ID('TableA','U') is not null drop table TableA
create table TableA
(
rid int not null identity(1,1) primary key clustered
, Name nvarchar(64)
, Phone nvarchar(16)
, Email nvarchar(256)
, CreatedOn datetime default (getutcdate())
)
if OBJECT_ID('TableB','U') is not null drop table TableB
create table TableB
(
cid int not null identity(1,1) primary key clustered
, FirstName nvarchar(64)
, LastName nvarchar(64)
, Phone nvarchar(16)
, Email nvarchar(256)
, CreatedOn datetime default (getutcdate())
, Title nvarchar(16)
, [Address] nvarchar(256)
, City nvarchar(64)
, [State] nvarchar(64)
)
if OBJECT_ID('TableC','U') is not null drop table TableC
create table TableC
(
rid int primary key clustered
, cid int unique
, IsAuthor bit default(0)
, CreatedOn datetime default (getutcdate())
)
insert TableA (Name, Phone, Email) select 'John Smith', '2143556789', 't1@gmail.com'
insert TableA (Name, Phone, Email) select 'Jason K Crull', '2347896543', 't2@gmail.com'
alter table TableB
add TempRid int
insert TableB(FirstName, LastName, Phone, Email, TempRid)
select case when CHARINDEX(' ', Name) > 0 then SUBSTRING(Name, 1, CHARINDEX(' ', Name)-1) else Name end
, case when CHARINDEX(' ', Name) > 0 then SUBSTRING(Name, CHARINDEX(' ', Name)+1, LEN(Name)) else '' end
, Phone
, Email
, Rid
from TableA
insert TableC (rid, cid)
select TempRid, cid
from TableB
alter table TableB
drop column TempRid
select * from TableB
select * from TableC
Try it here: http://sqlfiddle.com/#!3/aaaed/1
Or the alternate method (inserting to C before B) here: http://sqlfiddle.com/#!3/99592/1