Join query result in duplicated rows - sql

-----------tblDListTest---------
id listid trackingcode
1 125 trc1
2 125 trc1
3 125 trc1
4 126 trc4
5 126 trc5
---------------------------------
---------tblTrcWeightTest----------
id weight trackingcode
1 20 trc1
2 30 trc1
3 40 trc1
4 50 trc4
5 70 trc5
Need to display trackingcode and with their weight.
In tblDListTest, there are 3 records against listid 125.
I want to display only 3 records with weight.
I am using query :
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED
-- Pair every delivery-list row with every weight row that shares its tracking code.
SELECT
    DL.id,
    DL.listid,
    DL.trackingcode,
    tw.weight
FROM tblDListTest AS DL
INNER JOIN tblTrcWeightTest AS tw
    ON tw.trackingcode = DL.trackingcode
WHERE DL.listid = 125
My query result :
id listid trackingcode weight
1 125 trc1 20
1 125 trc1 30
1 125 trc1 40
2 125 trc1 20
2 125 trc1 30
2 125 trc1 40
3 125 trc1 20
3 125 trc1 30
3 125 trc1 40
But I want following result .
id listid trackingcode weight
1 125 trc1 20
2 125 trc1 30
3 125 trc1 40

You need a unique key (any combination of fields that results in a unique value) in one of the tables.
In your example, trc1 appears 3 times in each table.
SQL doesn't know how to pair these rows, so it produces a Cartesian product of all the possible combinations.
If you can't use a unique value in the join, you can use a SELECT DISTINCT DL.id, DL.listid, DL.trackingcode, tw.weight ....

There are duplicates between your tables. You would want to see something like this:
;WITH DL (id, listid, trackingcode) AS (
    -- Inline stand-in for tblDListTest; only id is converted to int.
    SELECT
        CONVERT(int, src.id),
        src.listid,
        src.trackingcode
    FROM (
        VALUES
            ('1', '125', 'trc1'),
            ('2', '125', 'trc1'),
            ('3', '125', 'trc1'),
            ('4', '126', 'trc4'),
            ('5', '126', 'trc5')
    ) AS src (id, listid, trackingcode)
),
tw (id, weight, trackingcode) AS (
    -- Inline stand-in for tblTrcWeightTest; only id is converted to int.
    SELECT
        CONVERT(int, src.id),
        src.weight,
        src.trackingcode
    FROM (
        VALUES
            ('1', '20', 'trc1'),
            ('2', '30', 'trc1'),
            ('3', '40', 'trc1'),
            ('4', '50', 'trc4'),
            ('5', '70', 'trc5')
    ) AS src (id, weight, trackingcode)
)
-- DISTINCT collapses the repeated (listid, trackingcode, weight) combinations.
SELECT DISTINCT
    DL.listid,
    DL.trackingcode,
    tw.weight
FROM DL
INNER JOIN tw
    ON tw.trackingcode = DL.trackingcode
WHERE DL.listid = 125

You can use row_number() to enumerate the values and then use that for the join:
-- Number the rows of each table within a tracking code (ordered by id),
-- then pair the n-th list row with the n-th weight row.
WITH dl_numbered AS (
    SELECT
        dl.*,
        ROW_NUMBER() OVER (PARTITION BY dl.trackingcode ORDER BY dl.id) AS seqnum
    FROM tblDListTest AS dl
),
tw_numbered AS (
    SELECT
        tw.*,
        ROW_NUMBER() OVER (PARTITION BY tw.trackingcode ORDER BY tw.id) AS seqnum
    FROM tblTrcWeightTest AS tw
)
SELECT
    dl.id,
    dl.listid,
    dl.trackingcode,
    tw.weight
FROM dl_numbered AS dl
INNER JOIN tw_numbered AS tw
    ON tw.trackingcode = dl.trackingcode
   AND tw.seqnum = dl.seqnum
WHERE dl.listid = 125;

You can just use something like this.
-- Self-contained repro: table variables stand in for the two real tables.
-- Table variables are declared with the @ prefix (# is for temp tables).
DECLARE @tblDListTest TABLE (
    ID INT,
    listid INT,
    trackingcode VARCHAR(20)
)
DECLARE @tblTrcWeightTest TABLE (
    ID INT,
    weight INT,
    trackingcode VARCHAR(20)
)
INSERT INTO @tblDListTest (ID, listid, trackingcode)
VALUES (1, 125, 'trc1'),
       (2, 125, 'trc1'),
       (3, 125, 'trc1'),
       (4, 126, 'trc4'),
       (5, 126, 'trc5')
INSERT INTO @tblTrcWeightTest (ID, weight, trackingcode)
VALUES (1, 20, 'trc1'),
       (2, 30, 'trc1'),
       (3, 40, 'trc1'),
       (4, 50, 'trc4'),
       (5, 70, 'trc5')
-- Rows are paired on ID; this relies on the two tables sharing ID values
-- one-to-one, as they do in the sample data above.
SELECT A.ID, A.listid, A.trackingcode, B.weight
FROM @tblDListTest A
JOIN @tblTrcWeightTest B ON B.ID = A.ID
WHERE A.listid = 125

You can use subquery :
-- For each weight row of the tracking code, look up one listid from the
-- delivery list that carries the same tracking code.
select twt.id, tt.listid, twt.trackingcode, twt.weight
from tblTrcWeightTest twt cross apply (
    select top 1 tdt.listid
    from tblDListTest tdt
    where tdt.trackingcode = twt.trackingcode
    order by tdt.id -- without ORDER BY, which row TOP 1 returns is not guaranteed
) tt
where twt.trackingcode = 'trc1';

Related

Break up running sum into maximum group size / length

I am trying to break up a running (ordered) sum into groups of a max value. When I implement the following example logic...
-- Build a scratch temp table holding a random number of random-length strings.
IF OBJECT_ID(N'tempdb..#t') IS NOT NULL DROP TABLE #t
SELECT TOP (ABS(CHECKSUM(NewId())) % 1000) ROW_NUMBER() OVER (ORDER BY name) AS ID,
LEFT(CAST(NEWID() AS NVARCHAR(100)),ABS(CHECKSUM(NewId())) % 30) AS Description
INTO #t
FROM sys.objects
-- Local variables use the @ prefix; # denotes temp tables such as #t above.
DECLARE @maxGroupSize INT
SET @maxGroupSize = 100
;WITH t AS (
SELECT
*,
LEN(Description) AS DescriptionLength,
SUM(LEN(Description)) OVER (/*PARTITION BY N/A */ ORDER BY ID) AS [RunningLength],
-- Bucket number: the running total divided by @maxGroupSize.
SUM(LEN(Description)) OVER (/*PARTITION BY N/A */ ORDER BY ID)/@maxGroupSize AS GroupID
FROM #t
)
-- Show each row together with the total length of the bucket it landed in.
SELECT *, SUM(DescriptionLength) OVER (PARTITION BY GroupID) AS SumOfGroup
FROM t
ORDER BY GroupID, ID
I am getting groups that are larger than the maximum group size (length) of 100.
A recursive common table expression (rCTE) would be one way to resolve this.
Sample data
Limited set of fixed sample data.
-- Fixed sample rows: an ordering id plus a short text whose length is summed.
CREATE TABLE data
(
    id          INT,
    description NVARCHAR(20)
);

INSERT INTO data (id, description)
VALUES
    (1,  'qmlsdkjfqmsldk'),
    (2,  'mldskjf'),
    (3,  'qmsdlfkqjsdm'),
    (4,  'fmqlsdkfq'),
    (5,  'qdsfqsdfqq'),
    (6,  'mds'),
    (7,  'qmsldfkqsjdmfqlkj'),
    (8,  'qdmsl'),
    (9,  'mqlskfjqmlkd'),
    (10, 'qsdqfdddffd');
Solution
For every recursion step, a case expression evaluates (r.group_running_length + len(d.description) <= @group_max_length) to decide whether the previous group can be extended or a new group must be started.
Set group target size to 40 to better fit the sample data.
-- Maximum total description length allowed in one group.
-- Local variables use the @ prefix (# would denote a temp table).
declare @group_max_length int = 40;
with rcte as
(
  -- Anchor: the first row opens group 1.
  select d.id,
         d.description,
         len(d.description) as description_length,
         len(d.description) as running_length,
         1 as group_id,
         len(d.description) as group_running_length
  from data d
  where d.id = 1
    union all
  -- Recursive step: visit the next id and either extend the current group
  -- or start a new one when adding this row would exceed the maximum.
  select d.id,
         d.description,
         len(d.description),
         r.running_length + len(d.description),
         case
           when r.group_running_length + len(d.description) <= @group_max_length
           then r.group_id
           else r.group_id + 1
         end,
         case
           when r.group_running_length + len(d.description) <= @group_max_length
           then r.group_running_length + len(d.description)
           else len(d.description)
         end
  from rcte r
  join data d
    on d.id = r.id + 1
)
select r.id,
       r.description,
       r.description_length,
       r.running_length,
       r.group_id,
       r.group_running_length,
       gs.group_sum
from rcte r
-- The largest running length inside a group is that group's total.
cross apply ( select max(r2.group_running_length) as group_sum
              from rcte r2
              where r2.group_id = r.group_id ) gs -- group sum
order by r.id;
Result
Contains both the running group length as well as the group sum for every row.
id description description_length running_length group_id group_running_length group_sum
-- ---------------- ------------------ -------------- -------- -------------------- ---------
1 qmlsdkjfqmsldk 14 14 1 14 33
2 mldskjf 7 21 1 21 33
3 qmsdlfkqjsdm 12 33 1 33 33
4 fmqlsdkfq 9 42 2 9 39
5 qdsfqsdfqq 10 52 2 19 39
6 mds 3 55 2 22 39
7 qmsldfkqsjdmfqlkj 17 72 2 39 39
8 qdmsl 5 77 3 5 28
9 mqlskfjqmlkd 12 89 3 17 28
10 qsdqfdddffd 11 100 3 28 28
Fiddle to see things in action (includes random data version).

Find the most recent record from the table for the given criteria

It may seem like a duplicate question, but all the answers I found on SO didn't help me solve this.
So, I have this database that stores every update on an item. Essentially, when the item is first created, the statusId is 1 and the date it's created. If someone updated the item and changed the status of the item, the statusId for that item is added. For eg. a new row with statusId 2 is added with the current date. And so on and so forth. One example of the table is shown below:
id statusId updatedDate userId authId
1 1 2016-12-20 15:43:17.703 14 14
2 1 2016-12-20 15:54:01.523 14 15
3 2 2016-12-21 16:05:48.157 14 14
4 3 2016-12-21 16:27:58.610 14 15
5 1 2016-12-20 17:16:47.627 14 18
6 1 2016-12-20 17:27:58.930 14 19
7 1 2017-01-18 14:13:35.800 18 20
So, what I want to do next is query the table where the most recent statusId is given. For the table above, the query for statusid = 1 should show the following result:
id statusId updatedDate userId authId
5 1 2016-12-20 17:16:47.627 14 18
6 1 2016-12-20 17:27:58.930 14 19
7 1 2017-01-18 14:13:35.800 18 20
Notice how the list doesn't show for authIds 14 and 15 even though it has status 1 but have different statusId in the later date.
One way I tried doing is the following:
-- Self-join: keep A1 rows that have an earlier row (A2) for the same authId.
SELECT
    A1.id,
    A1.statusId,
    A1.updatedDate,
    A1.userId,
    A1.authId
FROM AuthProgressTracker AS A1
LEFT JOIN AuthProgressTracker AS A2
    ON (A2.authId = A1.authId AND A2.updatedDate < A1.updatedDate)
WHERE A2.authId IS NOT NULL
That didn't show the result I was looking for. I tried another one
-- Join each row to the max updatedDate computed per (authId, statusId, id).
SELECT *
FROM AuthProgressTracker AS T
INNER JOIN (
    SELECT
        id,
        authId,
        statusId,
        MAX(updatedDate) AS maxDate
    FROM AuthProgressTracker
    GROUP BY authId, statusId, id
) AS AP
    ON T.id = AP.id
   AND AP.maxDate = T.updatedDate
ORDER BY T.id
This didn't produce the desired result either.
What am I missing?
And how can I break down the problems in SQL Server 2012 so that I can learn to figure out the problems like this in the future?
Your problem statement may have led you a bit astray, because while you want the most recent records, the timestamp may not be how you arrive at your result set. In the query below, I use a subquery which identifies all authId values that have no statusId other than 1. This then filters the original table to leave you with the results you want.
-- Return every row whose authId has never had a statusId other than 1.
SELECT t1.*
FROM AuthProgressTracker AS t1
INNER JOIN (
    SELECT authId
    FROM AuthProgressTracker
    GROUP BY authId
    -- Count only rows with a different status; zero means status 1 throughout.
    HAVING COUNT(CASE WHEN statusId <> 1 THEN 1 END) = 0
) AS t2
    ON t2.authId = t1.authId
(You haven't stated what RDBMS you're using, so you'll need to adjust your queries accordingly. E.g. If using mysql, use LIMIT syntax instead of TOP.)
-- Sample data in a table variable (declared with @; # would denote a temp table).
declare @AuthProgressTracker table (
    id int,
    statusId int,
    updatedDate datetime,
    userId int,
    authId int
)
insert into @AuthProgressTracker
values
(1, 1, '2016-12-20 15:43:17.703', 14, 14),
(2, 1, '2016-12-20 15:54:01.523', 14, 15),
(3, 2, '2016-12-21 16:05:48.157', 14, 14),
(4, 3, '2016-12-21 16:27:58.610', 14, 15),
(5, 1, '2016-12-20 17:16:47.627', 14, 18),
(6, 1, '2016-12-20 17:27:58.930', 14, 19),
(7, 1, '2017-01-18 14:13:35.800', 18, 20)
/* Determine id's of latest status updates per authId */
SELECT MAX(id) as LatestSatus
FROM @AuthProgressTracker
GROUP BY authId
/* Note the above includes row id's you've chosen to exclude, so... */
/* Determine most recent statusId */
SELECT TOP 1 statusId
FROM @AuthProgressTracker
ORDER BY updatedDate DESC
/* Putting it all together */
SELECT i.*
FROM @AuthProgressTracker i -- alias i must be separated from the table name
INNER JOIN (
    SELECT MAX(id) as LatestSatus
    FROM @AuthProgressTracker
    GROUP BY authId
) ls ON
    ls.LatestSatus = i.id
WHERE i.statusId = (
    SELECT TOP 1 statusId
    FROM @AuthProgressTracker
    ORDER BY updatedDate DESC
)

Complex Query where record is max in set

I have some data that looks like this
CandidateCategory
candidateCategoryId
candidateId
categoryId
I want to return all records where a specific categoryId is the most recent entry, this max(candidateCategoryId)
So if a candidate has 5 categories I want to get that record for say category 23 but only if that is the most recent category added, ie candidateCategoryId is higher than all others for that category.
Using MS SQL 2012
Sample data in format
candidateCategoryId candidateId categoryId
100 1 10
101 1 11
102 1 50
103 1 23
104 1 40
no result, 23 isn't the max candidateCategoryId
candidateCategoryId candidateId categoryId
200 2 20
201 2 31
202 2 12
203 2 23
return result, 23 is the max candidateCategoryId for this candidate.
Try getting the max CandidateCategoryID per CandidateID First, then re-join back
-- MaxPerID holds the highest candidateCategoryId for each candidate; joining
-- back keeps that row only when its category is 23.
SELECT yd2.*
FROM (
    SELECT
        yd.candidateID,
        MAX(yd.candidateCategoryId) AS maxCandCatID
    FROM YourData yd
    GROUP BY yd.candidateID
) MaxPerID
INNER JOIN YourData yd2
    ON yd2.candidateID = MaxPerID.candidateID
   AND yd2.CandidateCategoryID = MaxPerID.maxCandCatID
   AND yd2.categoryID = 23
So, from your sample data, the inner prequery "MaxPerID" will generate two rows...
CandidateID MaxCandCatID (and ultimately corresponds to category ID)
1 104 40
2 203 23
Then, re-joining back to your original table on these two inclusive of your AND CategoryID = 23 will only return the second CandidateID entry
And to help clarify to others who posted answers, the person does not appear to want the highest category ID, but if you look at them, they are sequentially added -- like an auto-incrementing number for the CandidateCategoryID. So, they want the most recent entry for a given candidate (hence candidates 1 & 2)... and if the last entry made was that of category = 23, they want THAT one.
-- Find each candidate's latest row (highest candidatecategoryid), then keep it
-- only if that latest row is category 23.
select x.*
  from (select t.*
          from tbl t
          join (select candidateid,
                       max(candidatecategoryid) as lastid
                  from tbl
                 group by candidateid) v -- one "last" row per candidate
            on t.candidateid = v.candidateid
           and t.candidatecategoryid = v.lastid) x -- match on the id that lastid was computed from
 where x.categoryid = 23
This is a basic "greatest-of-n" problem. You can solve it with not exists, among other ways:
-- Keep a row only when the same candidate has no row with a higher
-- candidateCategoryId, i.e. it is that candidate's most recent entry.
select t.*
from somedata t
where not exists (select 1
                  from somedata t2
                  where t2.candidateId = t.candidateId and
                        t2.candidateCategoryId > t.candidateCategoryId
                 );
EDIT:
If you only want categories where the max is 23, then add another condition:
-- Keep a candidate's most recent entry (no later row for the same candidate),
-- and only when that entry is category 23.
select t.*
from somedata t
where not exists (select 1
                  from somedata t2
                  where t2.candidateId = t.candidateId and
                        t2.candidateCategoryId > t.candidateCategoryId
                 ) and
      t.categoryId = 23
Another way to skin this cat. Using your sample data, we can create an inline table for testing and get
-- Sample data in a table variable (declared with @; # would denote a temp table).
DECLARE @candidates TABLE (CandidateCategoryId int,CandidateId int,CategoryId int)
INSERT INTO @candidates
SELECT 100, 1, 10
UNION
SELECT 101, 1, 11
UNION
SELECT 102, 1, 50
UNION
SELECT 103, 1, 23
UNION
SELECT 104, 1, 40
UNION
SELECT 200, 2, 20
UNION
SELECT 201, 2, 31
UNION
SELECT 202, 2, 12
UNION
SELECT 203, 2, 23
-- Join each candidate to the row holding its highest CategoryId value.
SELECT * FROM @candidates c
JOIN
(
SELECT CandidateId,MAX(CategoryId) AS CategoryId FROM @candidates
GROUP BY CandidateId
) tmp
ON c.CandidateId = tmp.CandidateId
AND c.CategoryId = tmp.CategoryId
And get results that look like
CandidateCategoryId | CandidateId | CategoryId
----------------------------------------------
201 | 2 | 31
102 | 1 | 50
I came up with this
-- Ids that are the newest entry for their candidate and belong to category 23.
SELECT candidateCategoryId
FROM candidateCategory
WHERE categoryId = 23
  AND candidateCategoryId IN (
      SELECT MAX(candidateCategoryId)
      FROM candidateCategory
      GROUP BY candidateId
  )
-- Return the row(s) carrying the single highest candidateCategoryId in the table.
SELECT *
FROM yourtable
WHERE candidateCategoryId = (
    SELECT MAX(candidateCategoryId)
    FROM yourtable
)
-- Category to look for; local variables use the @ prefix.
declare @categoryId int
select  @categoryId = 23
-- The statement before WITH must be terminated, hence the leading semicolon.
;with cte as
(
    -- rn = 1 marks each candidate's newest row (highest candidateCategoryId).
    select  candidateCategoryId, candidateId, categoryId,
            rn = row_number() over (partition by candidateId order by candidateCategoryId desc)
    from    yourtable
)
-- Return all rows of every candidate whose newest row is in @categoryId.
select  *
from    cte c
where   exists
        (
            select  *
            from    cte x
            where   x.candidateId = c.candidateId
            and     x.rn          = 1
            and     x.categoryId  = @categoryId
        )

SQL Counting Duplicates in a Column

I have been stuck on this problem for a while and have searched over the net for an answer..
My problem is:
I have duplicates in one column. I want to count how many duplicates there are in the one column and then I want to divide the a field by that count. I want to be able to do this for each record in the column as well.
Basically I want the script to behave like this
Count number of duplicates -> divide field A by count of duplicates.
Sample data:
t1.Invoiceno | t2.Amount | t2.orderno
-------------------------------------
201412 200 P202
201412 200 P205
302142 500 P232
201412 300 P211
450402 250 P102
450402 250 P142
450402 250 P512
Desired Result:
Invoiceno | Amount | orderno| duplicates|amount_new
-------------------------------------------------
201412 200 P202 2 100
201412 200 P205 2 100
302142 500 P232 1 500
201552 300 P211 1 300
450402 1200 P102 3 400
450402 1200 P142 3 400
450402 1200 P512 3 400
I do not want to insert new columns into the table, I just want the results to show the two new columns.
Here is one way:
-- dups = how many distinct onecol values occur more than once;
-- every row's A is divided by that single number.
SELECT A / dups.dups
FROM t
CROSS JOIN (
    SELECT COUNT(*) AS dups
    FROM (
        SELECT onecol
        FROM t
        GROUP BY onecol
        HAVING COUNT(*) > 1
    ) o
) dups
EDIT:
Well, now that the problem has been clarified into something more reasonable, you can use a similar approach to the above, but the dups subquery needs to be aggregated by invoice and amount:
-- Divide each row's amount by the number of rows sharing its invoice and amount.
-- `table` is the answer's placeholder; substitute the real table name.
select t.amount / dups.dups as new_amount -- qualified: both sides expose an amount column
from table t join
     (select invoice, amount, count(*) as dups
      from table t
      group by invoice, amount -- one count per (invoice, amount) pair
     ) dups
     on t.invoice = dups.invoice and t.amount = dups.amount;
Here is another way:
-- Sample data in a table variable (declared with @; # would denote a temp table).
Declare @tempTable Table ( ID int , A int)
INSERT INTO @tempTable VALUES (1, 12)
INSERT INTO @tempTable VALUES (1, 12)
INSERT INTO @tempTable VALUES (2, 20)
INSERT INTO @tempTable VALUES (2, 24)
INSERT INTO @tempTable VALUES (2, 15)
INSERT INTO @tempTable VALUES (3, 10)
INSERT INTO @tempTable VALUES (5, 12)
-------------------------------------------
-- DupsCTE: number of rows per ID.
;WITH DupsCTE (ID, DuplicateCount) AS
(
SELECT ID, COUNT(*) AS DuplicateCount FROM @tempTable GROUP BY ID
)
-- Show each row with its ID's row count and A divided by that count.
SELECT t.ID, t.A,
c.DuplicateCount, t.A / c.DuplicateCount AS ModifiedA
FROM
@tempTable t
INNER JOIN DupsCTE c ON c.ID = t.ID

TSQL Sweepstakes Script

I need to run a sweepstakes script to get X amount of winners from a customers table. Each customer has N participations. The table looks like this
CUSTOMER-A 5
CUSTOMER-B 8
CUSTOMER-C 1
I can always script to have CUSTOMER-A,B and C inserted 5, 8 and 1 times respectively in a temp table and then select randomly using order by newid() but would like to know if there's a more elegant way to address this.
(Update: Added final query.)
(Update2: Added single query to avoid temp table.)
Here's the hard part using a recursive CTE plus the final query that shows "place".
Code
-- Table variables are declared with the @ prefix (# would denote a temp table).
-- @cust: one row per customer with its number of participations.
DECLARE @cust TABLE (
    CustomerID int IDENTITY,
    ParticipationCt int
)
-- @list: one row per participation, numbered in random order.
DECLARE @list TABLE (
    CustomerID int,
    RowNumber int
)
INSERT INTO @cust (ParticipationCt) VALUES (5)
INSERT INTO @cust (ParticipationCt) VALUES (8)
INSERT INTO @cust (ParticipationCt) VALUES (1)
INSERT INTO @cust (ParticipationCt) VALUES (3)
INSERT INTO @cust (ParticipationCt) VALUES (4)
SELECT * FROM @cust
-- Recursive CTE: emit each customer once per participation (lvl 1..ParticipationCt).
;WITH t AS (
    SELECT
        lvl = 1,
        CustomerID,
        ParticipationCt
    FROM @cust
    UNION ALL
    SELECT
        lvl = lvl + 1,
        CustomerID,
        ParticipationCt
    FROM t
    WHERE lvl < ParticipationCt
)
INSERT INTO @list (CustomerID, RowNumber)
SELECT
    CustomerID,
    ROW_NUMBER() OVER (ORDER BY NEWID())
FROM t
--<< All rows
SELECT * FROM @list ORDER BY RowNumber
--<< All customers by "place"
-- A customer's place is decided by its lowest (earliest) random row number.
SELECT
    CustomerID,
    ROW_NUMBER() OVER (ORDER BY MIN(RowNumber)) AS Place
FROM @list
GROUP BY CustomerID
Results
CustomerID ParticipationCt
----------- ---------------
1 5
2 8
3 1
4 3
5 4
CustomerID RowNumber
----------- -----------
4 1
1 2
1 3
2 4
1 5
5 6
2 7
2 8
4 9
2 10
2 11
2 12
1 13
5 14
5 15
3 16
5 17
1 18
2 19
2 20
4 21
CustomerID Place
----------- -----
4 1
1 2
2 3
5 4
3 5
Single Query with No Temp Table
It is possible to get the answer with a single query that does not use a temp table. This works fine, but I personally like the temp table version better so you can validate the interim results.
Code (Single Query)
-- Reads the @Cust table variable (CustomerID, ParticipationCt); it must be
-- declared and populated earlier in the same batch.
;WITH List AS (
    -- One row per participation: lvl runs from 1 to ParticipationCt per customer.
    SELECT
        lvl = 1,
        CustomerID,
        ParticipationCt
    FROM @Cust
    UNION ALL
    SELECT
        lvl = lvl + 1,
        CustomerID,
        ParticipationCt
    FROM List
    WHERE lvl < ParticipationCt
),
RandomOrder AS (
    -- Number all participations in random order.
    SELECT
        CustomerID,
        ROW_NUMBER() OVER (ORDER BY NEWID()) AS RowNumber
    FROM List
)
-- A customer's place is decided by its lowest random row number.
SELECT
    CustomerID,
    ROW_NUMBER() OVER (ORDER BY MIN(RowNumber)) AS Place
FROM RandomOrder
GROUP BY CustomerID
try this:
-- X and Table are the answer's placeholders for the winner count and table name.
-- Each customer's Sort value scales Rand(CustomerId) by its share of all rows.
SELECT TOP X CustomerId
FROM (
    SELECT
        CustomerId,
        Rand(CustomerId) * Count(*) / (SELECT Count(*) FROM Table) Sort
    FROM Table
    GROUP BY CustomerId
) Z
ORDER BY Sort DESC
EDIT: the above assumed multiple rows per customer, one row per participation. Sorry; the following assumes one row per customer, with the column Participations holding the number of participations for that customer.
-- One row per customer: the sort key is Participations minus Rand(CustomerId)
-- times the total number of participations across all customers.
SELECT TOP 23 CustomerId
FROM (
    SELECT
        CustomerId,
        Participations - RAND(CustomerId) * (SELECT SUM(Participations) FROM customers) sort
    FROM customers
) Z
ORDER BY sort DESC