Marking duplicate records in a table - sql

I am trying to mark duplicate records; however, I get a wrong reassignment on a few of them and I don't know why.
Data:
=FirstName | LastName | Company | Group | Status | ID
x | x | x | NULL | NULL | 1
x | x | x | NULL | NULL | 2
Then I run this query to find matches on FirstName, LastName, Company
and join it back to the main table to mark the records:
with d as (
select ID, FirstName, LAstName, Company, row_number() over (partition by FirstName,LastName, Company order by FirstName,LastName, Company) as nr
from [dbo].xx)
Update b
set Status = 'S'
, Group = d.DQ_ID
from xx as b inner join d on
b.FirstName = d.FirstName and
b.LastNAme = d.LastName and
b.Company = d.Company
where d.nr = 1
And then Update the Main Record with P
Update b
set Status = 'P'
from xx as b
where b.ID = b.Group
GO
What I expect:
=FirstName | LastName | Company | Group | Status | ID
x | x | x | 1 | P | 1
x | x | x | 1 | S | 2
What I get:
=FirstName | LastName | Company | Group | Status | ID
x | x | x | 2 | S | 1
x | x | x | 1 | S | 2
I am working on about 1M records - and it only happens to some of them!

Try this :
-- Mark duplicates on (FirstName, LastName, Company) in a single UPDATE:
-- within each group the row with the lowest ID becomes the primary ('P'),
-- every other row becomes a secondary ('S'), and each row's [Group] is set
-- to the primary's ID.
;with d as (
    select
        ID,
        FirstName,
        LastName,
        Company,
        row_number() over (
            partition by FirstName, LastName, Company
            order by ID asc -- order by ID so nr = 1 is always the lowest ID of the group
        ) as nr
    from [dbo].xx
),
e as (
    -- e keeps only the nr = 1 row of each group; it is joined back to every
    -- row that shares the same FirstName / LastName / Company.
    select ID, FirstName, LastName, Company
    from d
    where nr = 1
)
update b
set Status = case when e.ID = b.ID then 'P' else 'S' end -- the nr = 1 row matches its own ID and gets 'P'; all others get 'S'
  , [Group] = e.ID -- Group is a reserved word, so it has to be bracketed
from xx as b
inner join e on
    b.FirstName = e.FirstName and
    b.LastName = e.LastName and
    b.Company = e.Company

You can try the following:
-- Rank the rows of each (FirstName, LastName, Company) group by ID, then
-- update through the CTE: every row receives the ID of its group's rank-1 row
-- as [Group]; the rank-1 row itself is flagged 'P' and the others 'S'.
;WITH RankedData AS
(
    SELECT
        T.ID,
        T.[Group],
        T.Status,
        T.FirstName,
        T.LastName,
        T.Company,
        ROW_NUMBER() OVER (
            PARTITION BY T.FirstName, T.LastName, T.Company
            ORDER BY T.ID ASC
        ) AS GroupRanking
    FROM dbo.xx AS T
)
UPDATE Cur
SET
    [Group] = Lead.ID,
    Status = CASE Cur.GroupRanking WHEN 1 THEN 'P' ELSE 'S' END
FROM RankedData AS Cur
INNER JOIN RankedData AS Lead
    ON  Cur.FirstName = Lead.FirstName
    AND Cur.LastName = Lead.LastName
    AND Cur.Company = Lead.Company
    AND Lead.GroupRanking = 1 -- join each row only to the first-ranked row of its group
Keep in mind that the INNER JOIN only matches rows whose names and companies are not NULL, so you will have to handle NULLs separately if those columns can contain them.

Related

How To Pull In Columns From A Derived Table Or Sub Query

I have a query that looks for records that don't have a matching account number and tries to match those accounts by address.
I am getting the results I want, but I want to include columns from the table2 below. How can I do this?
Select DISTINCT
account_num
,product
,accountName
,address_1
,address_2
,city
,state
,zip
,short_address
INTO #Matching_Address
From #Non_Matching_Accounts t
Where EXISTS
(SELECT * FROM (SELECT
left(ADDRESS_LINE1_TXT,20) AS matching_add
,CITY
,STATE
,ZIP
,ACCOUNT_OWNER
From [database].[dbo].[table2]) v (matching_add, CITY, STATE,ZIP,ACCOUNT_OWNER)
WHERE
t.short_address= v.matching_add
AND t.city= v.NAME
AND t.state = v.STATE
AND t.zip = v.ZIP
AND t.accountName LIKE '%'+v.ACCOUNT_OWNER+'%')
I've tried:
Select DISTINCT
account_num
,product
,accountName
,address_1
,address_2
,city
,state
,zip
,short_address
,matching_add
,CITY
,STATE
,ZIP
,ACCOUNT_OWNER
INTO #Matching_Address
From #Non_Matching_Accounts t
Where EXISTS
(SELECT * FROM (SELECT
left(ADDRESS_LINE1_TXT,20) AS Select DISTINCT
account_num
,product
,accountName
,address_1
,address_2
,city
,state
,zip
,short_address
INTO #Matching_Address
From #Non_Matching_Accounts t
Where EXISTS
(SELECT * FROM (SELECT
left(ADDRESS_LINE1_TXT,20) AS matching_add
,CITY
,STATE
,ZIP
,ACCOUNT_OWNER
From [database].[dbo].[table2]) v (matching_add, CITY, STATE,ZIP,ACCOUNT_OWNER)
WHERE
t.short_address= v.matching_add
AND t.city= v.NAME
AND t.state = v.STATE
AND t.zip = v.ZIP
AND t.accountName LIKE '%'+v.ACCOUNT_OWNER+'%')
From [database].[dbo].[table2]) v (matching_add, CITY, STATE,ZIP,ACCOUNT_OWNER)
WHERE
t.short_address= v.matching_add
AND t.city= v.NAME
AND t.state = v.STATE
AND t.zip = v.ZIP
AND t.accountName LIKE '%'+v.ACCOUNT_OWNER+'%')
Expected Results:
acct_num|prd|actName|add1|add2|city|state|zip|act_num2|prd2|actName|add1|add2|city2|state2|zip2|
----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
a | a | a | a | a | a | a | a | a | a | a | a | a | a | a |
b | b | b | b | b | b | b | b | b | b | b | b | b | b | b
c | c | c | c | c | c | c | c | c | c | c | c | c | c | c |
d | d | d | d | d | d | d | d | d | d | d | d | d | d | d |
You're using 'exists' when an 'inner join' is advised. Restructure as follows:
-- Join instead of EXISTS so that the columns of table2 can be selected.
-- The join condition compares against left(v.address_line1_txt, 20) directly:
-- matching_add is only a select-list alias and is not a column of table2.
select
    distinct t.account_num,
    t.product,
    t.accountName,
    t.address_1,
    t.address_2,
    t.city,
    t.state,
    t.zip,
    t.short_address,
    matching_add = left(v.address_line1_txt, 20),
    vCity = v.city,
    vState = v.state,
    vZip = v.zip,
    v.account_owner
into #Matching_Address
from #Non_Matching_Accounts t
join [database].[dbo].[table2] v
    on t.short_address = left(v.address_line1_txt, 20)
    and t.city = v.name -- NOTE(review): kept from the question; table2 may expose this as v.city - confirm
    and t.state = v.state
    and t.zip = v.zip
    and t.accountName like '%' + v.account_owner + '%'
An inner join (or just 'join' for short), will only return matches, so it works like 'exists' in that sense. But it makes the columns from the right-hand table available to you.
My hunch is that you may have tried this. I see a 'distinct' in your query, which probably would not have been necessary with just 'exists'. Did you abandon 'inner join' because it was duplicating your rows? If so, 'exists' is still not the answer. Maybe a cross apply can help you:
-- Same select list as the join version, but CROSS APPLY with TOP 1 keeps at
-- most one table2 match per row of #Non_Matching_Accounts.
select
    distinct t.account_num,
    t.product,
    t.accountName,
    t.address_1,
    t.address_2,
    t.city,
    t.state,
    t.zip,
    t.short_address,
    matching_add = left(v.address_line1_txt, 20),
    vCity = v.city,
    vState = v.state,
    vZip = v.zip,
    v.account_owner
into #Matching_Address
from #Non_Matching_Accounts t
cross apply (
    select
        top 1 *
    from [database].[dbo].[table2] v
    -- matching_add is not a column of table2, so compare the expression itself
    where t.short_address = left(v.address_line1_txt, 20)
    and t.city = v.name -- NOTE(review): kept from the question; table2 may expose this as v.city - confirm
    and t.state = v.state
    and t.zip = v.zip
    and t.accountName like '%' + v.account_owner + '%'
    order by v.address_line1_txt -- or whatever puts the better one on top
) v
With 'top 1', The 'v' result will produce no more than 1 record per row in 't'. With 'cross apply', if the result of 'v' is no records, then 't' will not return a row, (similar to 'exists' or 'inner join').

Counting Records with Unique Field Value

Source Table
Assuming I have a table called MyTable with the content:
+----------+------+
| Category | Code |
+----------+------+
| A | A123 |
| A | B123 |
| A | C123 |
| B | A123 |
| B | B123 |
| B | D123 |
| C | A123 |
| C | E123 |
| C | F123 |
+----------+------+
I'm trying to count the number of Code values which are unique to each category.
Desired Result
For the above example, the result would be:
+----------+-------------+
| Category | UniqueCodes |
+----------+-------------+
| A | 1 |
| B | 1 |
| C | 2 |
+----------+-------------+
Since C123 is unique to A, D123 is unique to B, and E123 & F123 are unique to C.
What I've Tried
I'm able to obtain the result for a single category (e.g. C) using a query such as:
SELECT COUNT(a.Code) AS UniqueCodes
FROM
(
SELECT MyTable.Code
FROM MyTable
WHERE MyTable.Category = "C"
) a
LEFT JOIN
(
SELECT MyTable.Code
FROM MyTable
WHERE MyTable.Category <> "C"
) b
ON a.Code = b.Code
WHERE b.Code IS NULL
However, whilst I can hard-code a query for each category, I cannot seem to construct a single query to calculate this for every possible Category value.
Here is what I've tried:
SELECT c.Category,
(
SELECT COUNT(a.Code)
FROM
(
SELECT MyTable.Code
FROM MyTable
WHERE MyTable.Category = c.Category
) a
LEFT JOIN
(
SELECT MyTable.Code
FROM MyTable
WHERE MyTable.Category <> c.Category
) b
ON a.Code = b.Code
WHERE b.Code IS NULL
) AS UniqueCodes
FROM
(
SELECT MyTable.Category
FROM MyTable
GROUP BY MyTable.Category
) c
Though, the c.Category is not defined within the scope of the nested SELECT query.
Could anyone advise how I could obtain the desired result?
I would use NOT EXISTS & do aggregation :
-- Count, per category, the rows whose code does not appear under any other category.
SELECT t.category, COUNT(*)
FROM MyTable t
WHERE NOT EXISTS (
    SELECT 1
    FROM MyTable other
    WHERE other.code = t.code
      AND other.category <> t.category
)
GROUP BY t.category;
You can use two levels of aggregation:
-- A code is unique to a category when the smallest and the largest category
-- it appears under are the same one; count those codes per category.
select c.minc as category, count(*) as UniqueCodes
from (select code, min(category) as minc, max(category) as maxc
      from MyTable -- the table from the question
      group by code
     ) as c
where c.minc = c.maxc
group by c.minc;
This would also work:
-- Join every row to the codes that occur in exactly one row overall, then
-- count the surviving rows per category.
select c.category, count(*)
from (
    select a.category, b.code_rows
    from mytable a
    join (
        select code, count(category) as code_rows
        from mytable
        group by code
        having count(category) = 1
    ) b on b.code = a.code
) c
group by c.category
Learning from @isaace's answer, I also came up with this -
-- Keep only the rows whose code occurs exactly once in MyTable, then count per category.
SELECT MyTable.Category, COUNT(*)
FROM MyTable
INNER JOIN (
    SELECT Code
    FROM MyTable
    GROUP BY Code
    HAVING COUNT(Category) = 1
) SingleCodes
    ON MyTable.Code = SingleCodes.Code
GROUP BY MyTable.Category

SQL statement select columns with specific value

I need some help making an SQL statement; I don't really know how to approach the situation. I have two tables, Departments and Employees
from which I want to select the Dpt_num and the Dpt_name of the departments that have at least one employee and that all their employees are from Barcelona
Case 1
== Departments =======
| Dpt_num | Dpt_name |
| 1 | A |
| 2 | B |
== Employees ===================
| E_num | Dpt_num | City |
| 1 | 1 | Barcelona |
| 2 | 1 | Barcelona |
The result in this case should be
Dpt_num Dpt_name
------------------
1 A
Case 2
== Departments =======
| Dpt_num | Dpt_name |
| 1 | A |
| 2 | B |
== Employees ==================
| E_num | Dpt_num | City |
| 1 | 1 | Barcelona |
| 2 | 1 | Madrid |
The result in this case should be empty.
I tried this for example but it seems very inefficient and it does not work in all the cases
select
num_dpt, nom_dpt
from
departements
where
1 = (select count(distinct e.ciutat_empl)
from empleats e
where e.num_dpt = num_dpt)
and not exists (select * from empleats e
where e.ciutat_empl != 'BARCELONA' and e.num_dpt = num_dpt);
I really appreciate any help. Thanks!
You want to go down the path of doing the filtering in the where clause. Then, use exists and not exists:
-- Departments that have at least one employee in 'BARCELONA' and no employee
-- whose city compares unequal to 'BARCELONA'.
SELECT d.num_dpt, d.nom_dpt
FROM departaments d
WHERE EXISTS (
        SELECT 1
        FROM empleats in_city
        WHERE in_city.num_dpt = d.num_dpt
          AND in_city.ciutat_empl = 'BARCELONA'
      )
  AND NOT EXISTS (
        SELECT 1
        FROM empleats other_city
        WHERE other_city.num_dpt = d.num_dpt
          AND other_city.ciutat_empl <> 'BARCELONA'
      );
The first condition checks that at least one employee is from Barcelona. The second checks that no employees are from any other city.
One major problem in your version is your correlation clause:
e.num_dpt = num_dpt
You think this is doing:
e.num_dpt = departaments.num_dpt
But it is really doing:
e.num_dpt = e.num_dpt
Always qualify your column names. This is especially important when you have more than one table reference in the query.
Join the tables, group by the department and check if the count of employees in Barcelona is equal to the count of all employees of the department.
-- The inner join drops departments without employees; the HAVING clause keeps
-- a department only when every joined employee row has city = 'Barcelona'.
SELECT d.dpt_num, d.dpt_name
FROM departments d
INNER JOIN employees e ON e.dpt_num = d.dpt_num
GROUP BY d.dpt_num, d.dpt_name
HAVING count(*) = count(CASE WHEN e.city = 'Barcelona' THEN 1 END);
I believe this should work:
-- Keep a department when the number of its Barcelona employees equals the
-- number of all its joined employee rows.
SELECT d.dpt_num, d.dpt_name
FROM departments d
INNER JOIN employees e
    ON d.dpt_num = e.dpt_num
GROUP BY d.dpt_num, d.dpt_name
HAVING sum(CASE WHEN e.city = 'Barcelona' THEN 1 ELSE 0 END) = count(*)
INNER JOIN makes sure there's at least 1 employee
HAVING count(*) = sum(case when e.city = 'Barcelona' then 1 else 0 end) makes sure that all employees are from Barcelona
demo: db<>fiddle
-- Collect each department's employee cities into an array, then keep the
-- departments for which every array element equals 'Barcelona'.
WITH dept_cities AS (
    SELECT d.dpt_num, d.dpt_name, array_agg(city) AS cities
    FROM dept d
    JOIN empl e
        ON d.dpt_num = e.dpt_num
    GROUP BY d.dpt_num, d.dpt_name
)
SELECT dpt_num, dpt_name
FROM dept_cities
WHERE 'Barcelona' = ALL(cities)
Aggregate the cities and then you can filter with the ALL operator which checks if all array elements fit the condition.
Generally speaking, you compare COUNT(*) with COUNT(some condition) for such problems:
-- Keep a department when its employee rows are non-empty and all of them
-- have City = 'Barcelona'.
SELECT *
FROM Departments
WHERE EXISTS (
    SELECT 1
    FROM Employees
    WHERE Employees.Dpt_num = Departments.Dpt_num
    HAVING COUNT(*) = COUNT(CASE WHEN Employees.City = 'Barcelona' THEN 1 END)
       AND COUNT(*) > 0 -- the count can be 0 when the WHERE matched no employee
)
DB Fiddle
Please try the query below:
-- Departments with at least one employee in 'Barcelona' and no employee whose
-- city (with NULL treated as 'x') differs from 'Barcelona'.
select a.dpt_number, a.dpt_name
from yy_department a
where exists (
        select 'x'
        from yy_employees y
        where y.dpt_number = a.dpt_number
          and y.city = 'Barcelona'
      )
  and not exists (
        select 'x'
        from yy_employees y
        where y.dpt_number = a.dpt_number
          and nvl(y.city, 'x') <> 'Barcelona' -- nvl makes a NULL city count as a non-Barcelona employee
      )

SQL SELECT multiple keys/values

I've got a table PERSON_PROPERTIES that resembles the following :
| ID | KEY | VALUE | PERSON_ID |
| 1 | fname | robert | 1 |
| 2 | lname | redford | 1 |
| 3 | fname | robert | 2 |
| 4 | lname | de niro | 2 |
| 5 | fname | shawn | 3 |
| 6 | nname | redford | 3 |
I would like to SELECT (in JPQL or in PSQL) the PERSON_ID that matches the given fname and lname.
I've tried
`SELECT DISTINCT *
FROM PERSON_PROPERTIES t0
WHERE ((((t0.key = 'fname')
AND (t0.value = 'robert'))
AND ((t0.key = 'lname')
AND (t0.value = 'redford'))))`
but it returns me no value.
I've also tried
`SELECT DISTINCT *
FROM PERSON_PROPERTIES t0
WHERE ((((t0.key = 'fname')
AND (t0.value = 'robert'))
OR ((t0.key = 'lname')
AND (t0.value = 'redford'))))`
but this way it returns me all values. I don't know how to turn the query properly for it to give me only value 1.
-- Per person, flag whether any row carries each wanted key/value pair and
-- keep only the persons for which both flags are set.
SELECT PERSON_ID
FROM PERSON_PROPERTIES
GROUP BY PERSON_ID
HAVING MAX(CASE WHEN key = 'fname' AND value = 'robert' THEN 1 ELSE 0 END) = 1
   AND MAX(CASE WHEN key = 'lname' AND value = 'redford' THEN 1 ELSE 0 END) = 1
Group by the person and select only those having both values.
Another approach would be with subselect (caution, it's MS SQL 2012)
-- Rows with fname = 'robert' whose person also has a row with lname = 'redford'.
SELECT PERSON_ID
FROM PERSON_PROPERTIES
WHERE PERSON_ID IN (
        SELECT PERSON_ID
        FROM PERSON_PROPERTIES
        WHERE [Key] = 'lname' AND value = 'redford'
      )
  AND [Key] = 'fname'
  AND value = 'robert'
Fiddle Demo
Along with some colleagues we came to this answer :
-- Keep only the rows that match one of the two wanted key/value pairs, then
-- require that the person matched BOTH keys. COUNT(DISTINCT p.key) is used
-- instead of COUNT(*) so that a person who has the same pair stored twice
-- (and the other pair not at all) is not reported as a match.
SELECT p.PERSON_ID
FROM PERSON_PROPERTIES p
WHERE (p.key = 'fname' AND p.value = 'robert')
   OR (p.key = 'lname' AND p.value = 'redford')
GROUP BY p.PERSON_ID
HAVING COUNT(DISTINCT p.key) = 2
What do you think about it?
SELF JOIN also does the trick. DISTINCT for duplicate person_id:
-- Pair each person's fname row with the same person's lname row; DISTINCT
-- collapses the repeats when a pair is stored more than once.
SELECT DISTINCT fn.PERSON_ID
FROM PERSON_PROPERTIES fn
JOIN PERSON_PROPERTIES ln
    ON fn.PERSON_ID = ln.PERSON_ID
WHERE fn.the_key = 'fname'
  AND fn.value = 'robert'
  AND ln.the_key = 'lname'
  AND ln.value = 'redford';
Demo
OK I will be marking this as the correct answer. The only thing I did was modified it a bit
SELECT Y.*, M.* FROM wp_postmeta as Y JOIN wp_postmeta AS M USING (`post_id`)
WHERE (Y.meta_key = 'agam_post_options_year' AND Y.meta_value = 2013)
AND (M.meta_key = 'agam_post_options_month' AND M.meta_value BETWEEN 0 AND 12 )
GROUP BY Y.meta_value, M.meta_value ORDER BY M.meta_value+0 DESC
So I get that DESC order.. however.. I noticed that it does not return duplicate results... I had two posts with the same year and same month... now I don't see them... is there anything there that's preventing this?

How to group using exists

I have the following table
personid talent
1 swim
2 play
1 play
1 swim
2 play
3 swim
3 swim
2 play
So person 1 can both swim and play. Person 2 can only play. Person 3 can only swim.
I need to get the following result
personid talent
1 both
2 play
3 swim
How can I do this using exists ?
I tried
SELECT DISTINCT personid,
CASE WHEN (EXISTS(
SELECT * FROM mytable
-- I got stuck
PS : I have a long solution that works . . But i do not like it because it is long
SELECT DISTINCT dis2.personid , CASE WHEN talcount = 2 THEN 'both'
ELSE talent END AS talent
FROM
(
SELECT personid , COUNT(talent) talcount
FROM
(
SELECT DISTINCT personid , talent
FROM my_table
) AS dis
GROUP BY personid
) dis2
JOIN my_table dis3
ON dis2.personid = dis3.personid
Do you really need to use EXISTS?
-- One row per person: 'both' when the person has two distinct talents,
-- otherwise the person's single talent (MIN just picks that one value).
SELECT
    personid,
    CASE
        WHEN COUNT(DISTINCT talent) = 2 THEN 'both'
        ELSE MIN(talent)
    END AS talent -- name the column so the result matches the requested personid / talent layout
FROM talents
GROUP BY personid
You could use the WITH clause to achieve the same effect:
-- Collapse duplicate (PERSONID, TALENT) rows first, count the distinct talents
-- per person, and label anyone with two of them as 'both'.
WITH
DISTINCT_TALENTS(PERSONID, TALENT) AS
    (SELECT DISTINCT PERSONID, TALENT
    FROM TALENTS)
SELECT DISTINCT PERSONID, TALENT
FROM
    (SELECT A.PERSONID,
        -- the CASE must be aliased, otherwise the outer query has no TALENT column to select
        CASE WHEN B.TALENT_COUNT = 2 THEN 'both' ELSE A.TALENT END AS TALENT
    FROM
        DISTINCT_TALENTS A
    INNER JOIN
        (SELECT PERSONID, COUNT(TALENT) AS TALENT_COUNT
        FROM DISTINCT_TALENTS
        GROUP BY PERSONID) B
    ON A.PERSONID = B.PERSONID) LABELLED -- derived tables need an alias in most engines
First you create a virtual DISTINCT_TALENTS table:
+------------------+
| personid talent |
+------------------+
| 1 play |
| 1 swim |
| 2 play |
| 3 swim |
+------------------+
next you create a subquery b with the following
+------------------------+
| personid talent_count |
+------------------------+
| 1 2 |
| 2 1 |
| 3 1 |
+------------------------+
you join with original DISTINCT_TALENTS to obtain
+----------+--------+--------------+
| personid | talent | talent_count |
+----------+--------+--------------+
| 1 | both | 2 |
| 1 | both | 2 |
| 2 | play | 1 |
| 3 | swim | 1 |
+----------+--------+--------------+
you take the distinct personid, talent to obtain the final result.
A solution similar to using exists is:
-- Full-outer-join the people who play with the people who swim: a person
-- missing from the 'play' side only swims, a person missing from the 'swim'
-- side only plays, and a person present on both sides does both.
-- Filtering each side BEFORE the join (instead of in the ON clause) keeps
-- unmatched rows from surfacing with a NULL PERSONID and the wrong label.
SELECT
    COALESCE(P.PERSONID, S.PERSONID) AS PERSONID,
    CASE
        WHEN P.PERSONID IS NULL THEN 'swim'
        WHEN S.PERSONID IS NULL THEN 'play'
        ELSE 'both'
    END AS TALENT
FROM
    (SELECT DISTINCT PERSONID FROM TALENTS WHERE TALENT = 'play') P
FULL OUTER JOIN
    (SELECT DISTINCT PERSONID FROM TALENTS WHERE TALENT = 'swim') S
    ON P.PERSONID = S.PERSONID
And finally, also with the EXISTS function used like a lookup function:
-- Label every row by checking, via EXISTS, whether the same person also has
-- the other talent; DISTINCT then collapses the repeated rows per person.
SELECT DISTINCT PERSONID, TALENT
FROM (
    SELECT A.PERSONID,
        CASE
            WHEN A.TALENT = 'play' AND EXISTS (SELECT 1 FROM TALENTS B WHERE A.PERSONID = B.PERSONID AND B.TALENT = 'swim')
                THEN 'both'
            WHEN A.TALENT = 'swim' AND EXISTS (SELECT 1 FROM TALENTS B WHERE A.PERSONID = B.PERSONID AND B.TALENT = 'play')
                THEN 'both'
            ELSE
                A.TALENT
        END AS TALENT
    FROM
        TALENTS A) LABELLED -- derived tables need an alias in most engines