How to COUNT duplicate rows? - sql

I want to be able to create a histogram out of tuples, each containing two integer values.
Here is the query:
-- NOTE(review): the derived table body is a placeholder; it is presumably built
-- client-side in Python by joining the individual sub-selects with UNION ALL.
SELECT temp.ad_id, temp.distance as hits FROM (
'UNION ALL .join(cupound_query)' # python
) as temp GROUP BY temp.ad_id,temp.distance
For this input:
(51, 5)
(51, 0)
(51, 3)
(51, 0)
(88, 2)
(88, 2)
(88, 2)
(84, 1)
(81, 9)
Would be:
(88,2) : 3
(51,0) : 2
(51,3) : 1
(51,5) : 1
(84,1) : 1
(81,9) : 1
How can I create a histogram of those values?
In other words, how can I count how many times a row has a duplicate?

The question leaves room for interpretation. This test case shows 2 nested steps:
-- Test fixture: 40 (ad_id, distance) pairs, several of them repeated.
CREATE TABLE tbl (
    ad_id    int,
    distance int
);

INSERT INTO tbl (ad_id, distance)
VALUES
    (510, 0), (956, 3), (823, 3), (880, 2),
    (523, 3), (467, 0), (843, 1), (816, 9),
    (533, 4), (721, 7), (288, 3), (900, 3),
    (526, 9), (750, 7), (302, 8), (463, 6),
    (742, 8), (804, 2), (62, 7), (880, 2),
    (523, 3), (467, 0), (843, 1), (816, 9),
    (533, 4), (721, 7), (288, 3), (900, 3),
    (526, 9), (750, 7), (302, 8), (816, 9),
    (533, 4), (721, 7), (288, 3), (900, 3),
    (533, 4), (721, 7), (288, 3), (396, 5);
How many duplicates per value?
-- Number of rows stored for every ad_id.
SELECT
    ad_id,
    count(*) AS ct
FROM tbl
GROUP BY ad_id;
Result:
ad_id | ct
-------+----
62 | 1
288 | 4
302 | 2
396 | 1
...
Read: ad_id 62 exists 1x, ad_id 288 exists 4x, ...
How to count how many times rows have duplicates?
-- Histogram of the per-ad_id row counts: how many ad_ids occur once, twice, ...
WITH per_ad AS (
    -- one row per ad_id with the number of rows it has in tbl
    SELECT
        ad_id,
        count(*) AS ct
    FROM tbl
    GROUP BY ad_id
)
SELECT
    ct,
    count(*) AS ct_ct
FROM per_ad
GROUP BY ct
ORDER BY ct;
Result:
ct | ct_ct
----+-------
1 | 8
2 | 7
3 | 2
4 | 3
Read: 8 occurrences of "ad_id is unique", 7 occurrences of "2 rows with same ad_id", ...
db<>fiddle here

Just add count(*) to your select:
-- Keep both grouping columns under their own names and let count(*) supply the
-- number of hits per (ad_id, distance) pair; the rest of the query is unchanged.
SELECT temp.ad_id, temp.distance, count(*) AS hits
....

Related

how to calculate consecutive difference using values of two columns? [duplicate]

I'm working on a data structure with list of positive or negative result for each person.
Sample data (id is an identity):
id person result
1 1 0
2 1 1
3 1 1
4 2 1
5 2 0
6 1 1
7 1 0
8 2 0
9 2 0
10 2 0
With this I would like to count the maximum number of consecutive result = 1 for each person. The result in this sample would be
person max_count
1 3
2 1
I have tried using ROW_NUMBER() OVER (PARTITION BY) like this
-- Attempt from the question: numbers the rows inside every (person, result)
-- partition in id order. TABLE stands in for the real table name.
SELECT person,
ROW_NUMBER() OVER (PARTITION BY person, result ORDER BY id) AS max_count
FROM TABLE
but it gives me an accumulative count instead of consecutive one.
What should I do to perform a consecutive count? Any hint would be appreciated. Thanks in advance
This looks like classic gaps-and-islands problem.
Examine intermediate results of each CTE in the query below to understand what is going on.
Sample data
I added person 3 with two sequences of positive results, so that we could find the longest sequence.
-- Sample data held in a local temporary table. DECLARE only accepts @-prefixed
-- table variables, so a #-prefixed name has to be created with CREATE TABLE.
-- The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#T') IS NOT NULL DROP TABLE #T;
CREATE TABLE #T (id int, person int, result int);
INSERT INTO #T (id, person, result) VALUES
(1 , 1, 0),
(2 , 1, 1),
(3 , 1, 1),
(4 , 2, 1),
(5 , 2, 0),
(6 , 1, 1),
(7 , 1, 0),
(8 , 2, 0),
(9 , 2, 0),
(10, 2, 0),
(11, 3, 0),
(12, 3, 1),
(13, 3, 1),
(14, 3, 1),
(15, 3, 1),
(16, 3, 0),
(17, 3, 1),
(18, 3, 1),
(19, 3, 0),
(20, 3, 0);
Query
-- Gaps-and-islands: length of the longest run of consecutive result = 1 rows per person.
WITH numbered AS (
    -- seq_all numbers every row of a person; seq_same numbers the rows of a person
    -- separately for each result value. Both follow id order.
    SELECT
        id,
        person,
        result,
        ROW_NUMBER() OVER (PARTITION BY person ORDER BY id)         AS seq_all,
        ROW_NUMBER() OVER (PARTITION BY person, result ORDER BY id) AS seq_same
    FROM #T
),
positive_runs AS (
    -- Inside an unbroken run of equal results both counters advance together,
    -- so their difference is constant and identifies the run.
    SELECT
        person,
        COUNT(*) AS run_length
    FROM numbered
    WHERE result = 1
    GROUP BY
        person,
        seq_all - seq_same
)
SELECT
    person,
    MAX(run_length) AS max_count
FROM positive_runs
GROUP BY person
ORDER BY person;
Result
+--------+-----------+
| person | max_count |
+--------+-----------+
| 1 | 3 |
| 2 | 1 |
| 3 | 4 |
+--------+-----------+
by using Case and SUM we can achieve the above result
-- Sample data held in a local temporary table (DECLARE only accepts @-prefixed
-- table variables). The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#T') IS NOT NULL DROP TABLE #T;
CREATE TABLE #T (id int, person int, result int);
INSERT INTO #T (id, person, result) VALUES
(1 , 1, 0),
(2 , 1, 1),
(3 , 1, 1),
(4 , 2, 1),
(5 , 2, 0),
(6 , 1, 1),
(7 , 1, 0),
(8 , 2, 0),
(9 , 2, 0),
(10, 2, 0);

-- A plain SUM(CASE ...) per person totals every result = 1 row, which over-counts
-- as soon as a person has two separate positive runs. Instead, a running SUM of
-- the non-positive rows gives each run its own number (it only increases when a
-- run is broken), and the positives are then summed per run.
-- Requires SQL Server 2012+ for the ordered window aggregate.
WITH marked AS (
    SELECT
        person,
        result,
        SUM(CASE WHEN result = 1 THEN 0 ELSE 1 END)
            OVER (PARTITION BY person ORDER BY id ROWS UNBOUNDED PRECEDING) AS run_id
    FROM #T
),
run_sizes AS (
    SELECT
        person,
        SUM(CASE WHEN result = 1 THEN 1 ELSE 0 END) AS run_length
    FROM marked
    GROUP BY person, run_id
)
select
    person,
    MAX(run_length) AS max_count
from run_sizes
GROUP BY person
ORDER BY person;

How to complete this specific question/query using SQL and an inline view?

I'm currently working on a hotel project where I need to make queries with SQL but I'm stuck on one question.
The question is:
How many employees have made at least 2 bookings for at least 3 customers?
I have figured out that I need to use an inline view but I have not gone any further because I'm stuck on the next part.
This is the table in the database:
bookingid | int | primary key
bookingdate | date| -
numOfGuests | int | -
customerId | int | foreign key
employeeId | int | foreign key
bookingid | bookingdate | numOfGuests | customerId | employeeId
1 2016-01-25 4 2 2
2 2016-06-12 1 3 2
3 2016-12-05 1 2 2
4 2016-04-01 2 3 2
5 2016-11-01 3 2 3
6 2016-11-03 1 8 2
7 2017-06-02 6 2 2
8 2016-02-07 2 8 2
9 2016-12-25 2 4 5
10 2017-06-21 1 10 2
11 2016-08-12 2 10 2
... ... ... ... ...
So does anyone know how to complete this question with a SQL query using an inline view?
The result I want are the employeeId's or id that satisfies the specifications of the question: Result based on sample data
CountOfemployeeID |
1
Please check this script-
-- Number of employees who made at least 2 bookings for each of at least 3
-- different customers.
-- The previous version filtered on COUNT(customerid), which counts bookings rather
-- than distinct customers, and then accepted an employee as soon as a single
-- customer had two bookings. Both conditions are now applied in order:
-- repeat customers first, then the number of repeat customers per employee.
SELECT COUNT(*) AS CountOfemployeeID
FROM
(
    -- Employees that have 3 or more repeat customers.
    SELECT rc.employeeId
    FROM
    (
        -- One row per (employee, customer) pair with 2 or more bookings.
        SELECT employeeId, customerId
        FROM Table1
        GROUP BY employeeId, customerId
        HAVING COUNT(bookingid) >= 2
    ) AS rc
    GROUP BY rc.employeeId
    HAVING COUNT(*) >= 3
) AS qualifying;
-- Sample bookings held in a local temporary table. DECLARE only accepts
-- @-prefixed table variables, so the #-prefixed name is created with CREATE TABLE.
-- The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#userData') IS NOT NULL DROP TABLE #userData;
create table #userData(
bookingid int,
bookingdate date,
numOfGuests int,
customerId int,
employeeId int
);
insert into #userData (bookingid, bookingdate, numOfGuests, customerId, employeeId)
values
(1, '2016-01-25', 4, 2, 2),
(2, '2016-06-12', 1, 3, 3),
(3, '2016-12-05', 1, 2, 4),
(4, '2016-04-01', 2, 2, 3),
(5, '2016-11-12', 3, 2, 3),
(6, '2017-01-15', 1, 5, 5),
(6, '2017-01-15', 1, 5, 5),
(6, '2017-01-15', 1, 5, 5),
(6, '2017-01-15', 1, 5, 5),
(6, '2017-01-15', 1, 5, 5),
(6, '2017-01-15', 1, 5, 5),
(1, '2016-01-25', 4, 2, 2),
(2, '2016-06-12', 1, 3, 3),
(3, '2016-12-05', 1, 2, 4),
(4, '2016-04-01', 2, 2, 3),
(5, '2016-11-12', 3, 2, 3),
(6, '2017-01-15', 1, 2, 5),
(6, '2017-01-15', 1, 2, 5),
(6, '2017-01-15', 1, 3, 5),
(6, '2017-01-15', 1, 3, 5),
(6, '2017-01-15', 1, 4, 5),
(6, '2017-01-15', 1, 4, 5),
(1, '2016-01-25', 4, 2, 2),
(2, '2016-06-12', 1, 3, 3),
(3, '2016-12-05', 1, 2, 4),
(4, '2016-04-01', 2, 2, 3),
(5, '2016-11-12', 3, 2, 3),
(6, '2017-01-15', 1, 1, 5),
(6, '2017-01-15', 1, 2, 5),
(6, '2017-01-15', 1, 3, 5),
(6, '2017-01-15', 1, 4, 5),
(6, '2017-01-15', 1, 7, 5),
(6, '2017-01-15', 1, 6, 5),
(1, '2016-01-25', 4, 3, 2),
(1, '2016-01-25', 4, 3, 2),
(1, '2016-01-25', 4, 1, 2),
(1, '2016-01-25', 4, 1, 2);
-- Show the raw sample rows.
select * from #userData;

-- CTE : (customer, employee) pairs that have 2 or more bookings.
-- cte2: employees that appear in 3 or more of those pairs.
with CTE as
(
select count(customerId) as [count], customerId, employeeId from #userData
group by customerId, employeeId having count(customerId) >= 2
), cte2 as
(
select employeeId from CTE group by employeeId having count(employeeId) >= 3
)
-- employeeId exists in both CTEs, so every output column is qualified with the
-- alias of CTE to avoid an ambiguous column reference.
select a.[count], a.customerId, a.employeeId from CTE as a
inner join cte2 as b on a.employeeId = b.employeeId
OUTPUT
count customerId employeeId
2 1 2
3 2 2
2 3 2
3 2 5
3 3 5
3 4 5
6 5 5
If You need only the EmployeeId, then just fire
-- Use this as the final SELECT of the WITH statement that defines CTE2;
-- a CTE is only visible inside the single statement that declares it.
Select employeeId from CTE2
output
Employeeid
2
5

Consecutive Count on Record Result

I'm working on a data structure with list of positive or negative result for each person.
Sample data (id is an identity):
id person result
1 1 0
2 1 1
3 1 1
4 2 1
5 2 0
6 1 1
7 1 0
8 2 0
9 2 0
10 2 0
With this I would like to count the maximum number of consecutive result = 1 for each person. The result in this sample would be
person max_count
1 3
2 1
I have tried using ROW_NUMBER() OVER (PARTITION BY) like this
-- Attempt from the question: numbers the rows inside every (person, result)
-- partition in id order. TABLE stands in for the real table name.
SELECT person,
ROW_NUMBER() OVER (PARTITION BY person, result ORDER BY id) AS max_count
FROM TABLE
but it gives me an accumulative count instead of consecutive one.
What should I do to perform a consecutive count? Any hint would be appreciated. Thanks in advance
This looks like classic gaps-and-islands problem.
Examine intermediate results of each CTE in the query below to understand what is going on.
Sample data
I added person 3 with two sequences of positive results, so that we could find the longest sequence.
-- Sample data held in a local temporary table. DECLARE only accepts @-prefixed
-- table variables, so a #-prefixed name has to be created with CREATE TABLE.
-- The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#T') IS NOT NULL DROP TABLE #T;
CREATE TABLE #T (id int, person int, result int);
INSERT INTO #T (id, person, result) VALUES
(1 , 1, 0),
(2 , 1, 1),
(3 , 1, 1),
(4 , 2, 1),
(5 , 2, 0),
(6 , 1, 1),
(7 , 1, 0),
(8 , 2, 0),
(9 , 2, 0),
(10, 2, 0),
(11, 3, 0),
(12, 3, 1),
(13, 3, 1),
(14, 3, 1),
(15, 3, 1),
(16, 3, 0),
(17, 3, 1),
(18, 3, 1),
(19, 3, 0),
(20, 3, 0);
Query
-- Gaps-and-islands: length of the longest run of consecutive result = 1 rows per person.
WITH numbered AS (
    -- seq_all numbers every row of a person; seq_same numbers the rows of a person
    -- separately for each result value. Both follow id order.
    SELECT
        id,
        person,
        result,
        ROW_NUMBER() OVER (PARTITION BY person ORDER BY id)         AS seq_all,
        ROW_NUMBER() OVER (PARTITION BY person, result ORDER BY id) AS seq_same
    FROM #T
),
positive_runs AS (
    -- Inside an unbroken run of equal results both counters advance together,
    -- so their difference is constant and identifies the run.
    SELECT
        person,
        COUNT(*) AS run_length
    FROM numbered
    WHERE result = 1
    GROUP BY
        person,
        seq_all - seq_same
)
SELECT
    person,
    MAX(run_length) AS max_count
FROM positive_runs
GROUP BY person
ORDER BY person;
Result
+--------+-----------+
| person | max_count |
+--------+-----------+
| 1 | 3 |
| 2 | 1 |
| 3 | 4 |
+--------+-----------+
by using Case and SUM we can achieve the above result
-- Sample data held in a local temporary table (DECLARE only accepts @-prefixed
-- table variables). The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#T') IS NOT NULL DROP TABLE #T;
CREATE TABLE #T (id int, person int, result int);
INSERT INTO #T (id, person, result) VALUES
(1 , 1, 0),
(2 , 1, 1),
(3 , 1, 1),
(4 , 2, 1),
(5 , 2, 0),
(6 , 1, 1),
(7 , 1, 0),
(8 , 2, 0),
(9 , 2, 0),
(10, 2, 0);

-- A plain SUM(CASE ...) per person totals every result = 1 row, which over-counts
-- as soon as a person has two separate positive runs. Instead, a running SUM of
-- the non-positive rows gives each run its own number (it only increases when a
-- run is broken), and the positives are then summed per run.
-- Requires SQL Server 2012+ for the ordered window aggregate.
WITH marked AS (
    SELECT
        person,
        result,
        SUM(CASE WHEN result = 1 THEN 0 ELSE 1 END)
            OVER (PARTITION BY person ORDER BY id ROWS UNBOUNDED PRECEDING) AS run_id
    FROM #T
),
run_sizes AS (
    SELECT
        person,
        SUM(CASE WHEN result = 1 THEN 1 ELSE 0 END) AS run_length
    FROM marked
    GROUP BY person, run_id
)
select
    person,
    MAX(run_length) AS max_count
from run_sizes
GROUP BY person
ORDER BY person;

SQL count joins distinct by multiple columns

I have a problem where I need to JOIN on the same table multiple times and count the JOINs.
Here is the database setup (SQL Fiddle):
-- Fixture for the question: four state definitions ...
CREATE TABLE state (
    [t_id]     int,
    [true_id]  int,
    [false_id] int,
    [msg]      varchar(32)
);

INSERT INTO state (t_id, true_id, false_id, msg)
VALUES
    (5,  6,  7,  'CASE_1'),
    (10, 11, 12, 'CASE_2'),
    (20, 21, 22, 'CASE_N'),
    (30, 31, 32, 'FOOOO');

-- ... and twelve steps that reference them through state_type / state_value.
CREATE TABLE step (
    [id]          int,
    [f_id]        int,
    [state_type]  int,
    [state_value] int
);

INSERT INTO step (id, f_id, state_type, state_value)
VALUES
    (1,  5, 5,  7),
    (2,  5, 5,  7),
    (3,  5, 5,  6),
    (4,  5, 10, 12),
    (5,  5, 10, 12),
    (6,  5, 10, 11),
    (7,  6, 10, 12),
    (8,  6, 10, 12),
    (9,  7, 20, 21),
    (10, 7, 20, 21),
    (11, 7, 30, 32),
    (12, 7, 30, 31);
here is my current query:
-- Current attempt from the question: joins every step to its state, then
-- LEFT JOINs state twice more to match state_value against true_id and
-- false_id; the counts are grouped by msg and f_id.
SELECT state.msg,
COUNT(state_true.true_id) AS Trues,
COUNT(state_false.false_id) AS Falses
FROM state
INNER JOIN step ON state.t_id = step.state_type
LEFT OUTER JOIN state AS state_true ON step.state_value = state_true.true_id
LEFT OUTER JOIN state AS state_false ON step.state_value = state_false.false_id
GROUP BY state.msg, step.f_id
And here what I get:
msg Trues Falses
CASE_1 1 2
CASE_2 1 2
CASE_2 0 2
CASE_N 2 0
FOOOO 1 1
And here what i need:
msg Trues Falses
CASE_1 1 0
CASE_2 1 1
CASE_N 1 0
FOOOO 1 0
For Explanation:
I need to count how many trues and falses there are per state_type and f_id combination.
There are 6 entries with f_id = 5 -> (1,2,3,4,5,6). If there are several entries with the same (f_id,state_type) combination, only the last one should be counted. So for f_id 5 the entries 1,2,4,5 should not be counted, as they are overwritten by 3 and 6.
So after processing the first 6 entries there should be CASE_1 true => 1 false => 0 and CASE_2 true => 1 false => 0
__ EDIT __
TABLE step:
(1, 5, 5, 7), -- do not count
(2, 5, 5, 7), -- do not count
(3, 5, 5, 6), -- this is the last entry with
-- (f_id,state_type) => (5,5) combination.
-- it overwrites the 2 previous ones => count CASE_1 true
(4, 5, 10, 12), -- do not count
(5, 5, 10, 12), -- do not count
(6, 5, 10, 11), -- count CASE_2 true
(7, 6, 10, 12), -- do not count
(8, 6, 10, 12), -- count CASE_2 false
(9, 7, 20, 21), -- do not count
(10, 7, 20, 21), -- count CASE_N true (21 is the true_id of CASE_N)
(11, 7, 30, 32), -- do not count
(12,7, 30, 31); -- count FOOOO true
I'm not sure I fully understand your intent, but maybe the query below might be what you want? Given your sample data it seems to produce the right output.
-- Keep only the newest step (highest id) of every (f_id, state_type) pair, then
-- tally per message whether that step's value equals the state's true_id or false_id.
WITH latest_step AS (
    SELECT MAX(id) AS last_id
    FROM step
    GROUP BY f_id, state_type
)
SELECT
    st.msg,
    SUM(CASE WHEN st.true_id  = sp.state_value THEN 1 ELSE 0 END) AS Trues,
    SUM(CASE WHEN st.false_id = sp.state_value THEN 1 ELSE 0 END) AS Falses
FROM state AS st
INNER JOIN step AS sp
    ON sp.state_type = st.t_id
INNER JOIN latest_step AS ls
    ON ls.last_id = sp.id
GROUP BY st.msg;
Please give it a try. If I misunderstood I'll remove the answer.

Find missing sequences by category

I have to identify missing records from the example below.
Category BatchNo TransactionNo
+++++++++++++++++++++++++++++++++
CAT1 1 1
CAT1 1 2
CAT1 2 3
CAT1 2 4
CAT1 2 5
CAT1 3 6
CAT1 3 7
CAT1 3 8
CAT1 5 12
CAT1 5 13
CAT1 5 14
CAT1 5 15
CAT1 7 18
CAT2 1 1
CAT2 1 2
CAT2 3 6
CAT2 3 7
CAT2 3 8
CAT2 3 9
CAT2 4 10
CAT2 4 11
CAT2 4 12
CAT2 6 14
I need a script that will identify missing records as below
Category BatchNo
+++++++++++++++++++
CAT1 4
CAT1 6
CAT2 2
CAT2 5
I do not need to know that CAT1 8 and CAT2 7 are not there as they potentially have not been inserted yet.
You can create a temporary result set with all possible batch numbers up to the maximum batch number for each category, then select the batch numbers that are not present in the table.
-- Sample data.
create table TEMP(
Category varchar(10),
BatchNo int,
TransactionNo int
);
insert into TEMP (Category, BatchNo, TransactionNo) values
('CAT1', 1, 1),
('CAT1', 1, 2),
('CAT1', 2, 3),
('CAT1', 2, 4),
('CAT1', 2, 5),
('CAT1', 3, 6),
('CAT1', 3, 7),
('CAT1', 3, 8),
('CAT1', 5, 9),
('CAT1', 7, 10),
('CAT2', 1, 1),
('CAT2', 1, 2),
('CAT2', 3, 3),
('CAT2', 4, 4),
('CAT2', 4, 5),
('CAT2', 4, 6),
('CAT2', 6, 7);

-- AllBatches lists every batch number from 1 up to the highest one of each
-- category; the outer query keeps the numbers that have no row in TEMP.
WITH AllBatches (BatchID, Category, MaxBatch) AS (
    SELECT 1, Category, MAX(BatchNo) AS MaxBatch FROM TEMP GROUP BY Category
    UNION ALL
    SELECT BatchID + 1, Category, MaxBatch FROM AllBatches
    WHERE BatchID < MaxBatch
)
SELECT
    ab.Category,
    ab.BatchID
FROM
    AllBatches AS ab
WHERE
    -- NOT EXISTS instead of NOT IN: BatchNo is nullable, and a single NULL in
    -- the NOT IN list would make the predicate filter out every row.
    NOT EXISTS (
        SELECT 1
        FROM TEMP AS t
        WHERE t.Category = ab.Category
          AND t.BatchNo = ab.BatchID
    )
ORDER BY
    ab.Category,
    ab.BatchID
-- The default recursion limit is 100 levels; lift it so categories with more
-- than ~100 batches do not abort. The recursion still stops at MaxBatch.
OPTION (MAXRECURSION 0);

DROP TABLE TEMP;
This one uses a Tally Table. For reference: http://www.sqlservercentral.com/articles/T-SQL/62867/
SAMPLE DATA
-- Sample data from the question: 13 rows for CAT1 and 10 rows for CAT2.
create table MyTable (
    Category      varchar(10),
    BatchNo       int,
    TransactionNo int
);

insert into MyTable (Category, BatchNo, TransactionNo)
values
    ('CAT1', 1, 1),  ('CAT1', 1, 2),
    ('CAT1', 2, 3),  ('CAT1', 2, 4),  ('CAT1', 2, 5),
    ('CAT1', 3, 6),  ('CAT1', 3, 7),  ('CAT1', 3, 8),
    ('CAT1', 5, 12), ('CAT1', 5, 13), ('CAT1', 5, 14), ('CAT1', 5, 15),
    ('CAT1', 7, 18),
    ('CAT2', 1, 1),  ('CAT2', 1, 2),
    ('CAT2', 3, 6),  ('CAT2', 3, 7),  ('CAT2', 3, 8),  ('CAT2', 3, 9),
    ('CAT2', 4, 10), ('CAT2', 4, 11), ('CAT2', 4, 12),
    ('CAT2', 6, 14);
SOLUTION
-- Tally-table approach: build the numbers 1..(highest BatchNo), pair them with
-- every category and keep the numbers below the category's maximum that have
-- no row in MyTable.
with ten(n) as (
    select 1 union all select 1 union all select 1 union all select 1 union all select 1
    union all
    select 1 union all select 1 union all select 1 union all select 1 union all select 1
),  -- 10 rows
hundred(n) as (
    select 1 from ten as a cross join ten as b
),  -- 10 x 10 = 100 rows
ten_thousand(n) as (
    select 1 from hundred as a cross join hundred as b
),  -- 100 x 100 = 10,000 rows
tally(n) as (
    -- only as many numbers as the highest BatchNo in the table
    select top (select top 1 BatchNo from MyTable order by BatchNo desc)
        row_number() over (order by (select null))
    from ten_thousand
)
select
    c.Category,
    t.n
from tally as t
cross join (
    select
        Category,
        max(BatchNo) as MaxBatchNo
    from MyTable
    group by Category
) as c
left join MyTable as m
    on  m.Category = c.Category
    and m.BatchNo = t.n
where t.n < c.MaxBatchNo
  and m.Category is null
order by
    c.Category,
    t.n
It is better to create a projection table and use standard left join to find gaps:
-- Number list 1..1000 held in a local temporary table. DECLARE only accepts
-- @-prefixed table variables, so the #-prefixed name is created with CREATE TABLE.
-- The guard makes the script re-runnable within one session.
IF OBJECT_ID('tempdb..#Sequencer') IS NOT NULL DROP TABLE #Sequencer;
create table #Sequencer (
Id int primary key
);
insert into #Sequencer (Id)
select top (1000) row_number() over(order by (select null)) from master.dbo.spt_values;

-- Every number below a category's highest BatchNo that has no matching batch.
-- Table is a reserved word, so the placeholder table name is bracket-quoted.
select *
from #Sequencer s
inner join (
select Category, max(BatchNo) as [Size] from dbo.[Table] group by Category
) cat on cat.Size > s.Id
left join (
select distinct Category, BatchNo from dbo.[Table]
) t on t.Category = cat.Category and t.BatchNo = s.Id
where t.BatchNo is null;
Of course, in real life you might need more than 1000 rows, so adjust it accordingly.
-- Batch numbers missing between each category's lowest and highest batch.
WITH Numbers AS (
    -- counts down from the overall highest BatchNo to 1
    SELECT MAX(BatchNo) AS Number
    FROM #MyTable
    UNION ALL
    SELECT Number - 1
    FROM Numbers
    WHERE Number > 1
),
CategoryBounds AS (
    -- first and last batch number seen per category
    SELECT
        Category,
        MIN(BatchNo) AS FirstBatch,
        MAX(BatchNo) AS LastBatch
    FROM #MyTable
    GROUP BY Category
)
SELECT
    cb.Category,
    n.Number AS BatchNo
FROM CategoryBounds AS cb
INNER JOIN Numbers AS n
    ON  n.Number >= cb.FirstBatch
    AND n.Number <= cb.LastBatch
WHERE NOT EXISTS (
    -- keep only the candidates that have no row in the source table
    SELECT 1
    FROM #MyTable AS mt
    WHERE mt.Category = cb.Category
      AND mt.BatchNo = n.Number
)
Without using a loop or a cursor fetch, you can use this one (#Category is my equivalent of your table name). Performance is very good.
-- Distinct (Category, BatchNo) pairs numbered through an IDENTITY column.
-- DECLARE only accepts @-prefixed table variables, so the #-prefixed name is
-- created with CREATE TABLE; the guard makes the script re-runnable.
IF OBJECT_ID('tempdb..#t') IS NOT NULL DROP TABLE #t;
CREATE TABLE #t (RN INT IDENTITY, Category VARCHAR(255), BatchNo INT);
INSERT INTO #t (Category, BatchNo)
SELECT DISTINCT Category, BatchNo
FROM #Category
-- The neighbour comparison below relies on RN following (Category, BatchNo)
-- order; without ORDER BY the identity values may be assigned in any order.
ORDER BY Category, BatchNo;

-- A row whose successor (RN + 1) in the same category is not BatchNo + 1 marks a gap.
-- NOTE: only the first missing BatchNo of each gap is reported.
SELECT a.Category, a.BatchNo + 1 AS BatchNo
FROM #t a
CROSS APPLY (SELECT * FROM #t b
WHERE a.RN + 1 = b.RN AND
a.Category = b.Category AND
a.BatchNo + 1 != b.BatchNo) x
-- Sample data in a local temporary table.
create table #cat (
    Category      varchar(10),
    BatchNo       int,
    TransactionNo int
);

insert into #cat (Category, BatchNo, TransactionNo)
values
    ('CAT1', 1, 1), ('CAT1', 1, 2),
    ('CAT1', 2, 3), ('CAT1', 2, 4), ('CAT1', 2, 5),
    ('CAT1', 3, 6), ('CAT1', 3, 7), ('CAT1', 3, 8),
    ('CAT1', 5, 9),
    ('CAT1', 7, 10),
    ('CAT2', 1, 1), ('CAT2', 1, 2),
    ('CAT2', 3, 3),
    ('CAT2', 4, 4), ('CAT2', 4, 5), ('CAT2', 4, 6),
    ('CAT2', 6, 7);

-- For every batch that is not the last one of its category, report BatchNo + 1
-- when the category has no row with that number.
SELECT DISTINCT
    cur.Category,
    cur.BatchNo + 1
FROM #cat AS cur
WHERE NOT EXISTS (
        SELECT 1
        FROM #cat AS nxt
        WHERE nxt.Category = cur.Category
          AND nxt.BatchNo = cur.BatchNo + 1
      )
  AND cur.BatchNo <> (
        SELECT MAX(last_batch.BatchNo)
        FROM #cat AS last_batch
        WHERE last_batch.Category = cur.Category
      )