Query to reflect actual significant change in data - sql

Given a table with employee statuses and effective dates, how can I retrieve just the data that reflects a change in status?
For example, given the following structure:
DECLARE #STATUSES TABLE(
EMPLOYEE_ID INT NOT NULL,
EFFECTIVE_DATE DATE NOT NULL,
STATUS_CODE CHAR(1) NOT NULL
)
INSERT #STATUSES VALUES (1, '2012-01-01', 'A')
INSERT #STATUSES VALUES (1, '2012-02-28', 'A')
INSERT #STATUSES VALUES (1, '2012-03-01', 'T')
INSERT #STATUSES VALUES (2, '2012-01-01', 'A')
INSERT #STATUSES VALUES (2, '2012-02-14', 'A')
INSERT #STATUSES VALUES (2, '2012-03-10', 'A')
INSERT #STATUSES VALUES (3, '2012-02-01', 'A')
INSERT #STATUSES VALUES (3, '2012-03-17', 'A')
INSERT #STATUSES VALUES (3, '2012-03-18', 'T')
INSERT #STATUSES VALUES (3, '2012-04-01', 'A')
INSERT #STATUSES VALUES (4, '2012-03-01', 'A')
What query can be used to result in the following?
EMPLOYEE_ID EFFECTIVE_DATE STATUS_CODE
1 2012-01-01 A
1 2012-03-01 T
2 2012-01-01 A
3 2012-02-01 A
3 2012-03-18 T
3 2012-04-01 A
4 2012-03-01 A
In other words, I want to leave out those records that have the same employee id and status code as the one before it, if one exists with an earlier effective date. Notice that employee 1 is listed only twice because there were only two actual changes in status--the one on 2012-02-28 is inconsequential since the status didn't change from the earlier date. Also notice that employee 2 is listed just once since his status never changed despite there being three records. Only the earliest date is shown for each change.

With some further experimenting, it looks like this will do what I want.
;WITH cte
AS (SELECT ROW_NUMBER() OVER (PARTITION BY EMPLOYEE_ID ORDER BY EFFECTIVE_DATE) AS rownum
,EMPLOYEE_ID
,EFFECTIVE_DATE
,STATUS_CODE
FROM #STATUSES)
SELECT t2.EMPLOYEE_ID
,t2.EFFECTIVE_DATE
,t2.STATUS_CODE
FROM cte t2
LEFT JOIN cte t1
ON t2.EMPLOYEE_ID = t1.EMPLOYEE_ID
AND t2.STATUS_CODE = t1.STATUS_CODE
AND t2.rownum = t1.rownum + 1
WHERE t1.EMPLOYEE_ID IS NULL

You could use a CURSOR
You'd need two sets of variables: #PreviousRecord and #CurrentRecord
Declare the cursor for table sorted by employeeid and date
Fetch the first record from the cursor into the #PreviousRecord variables - depending on your requirement register this as a significant change or not (write the record to a temp table)
Then set up a loop that:
Fetches the next record into the #CurrentRecord variables
Compares it with the previous record and if it matches your requirement for a significant change write it to the temp table
Move the #CurrentRecord values into the #PreviousRecord variables
I'd be interested to know if the CTE method was more efficient

SELECT
EMPLOYEE_ID, MIN(EFFECTIVE_DATE) AS EFFECTIVE_DATE, STATUS_CODE
FROM
(
SELECT
T1.EMPLOYEE_ID, T1.EFFECTIVE_DATE, T1.STATUS_CODE,
MAX(T2.EFFECTIVE_DATE) AS MOST_RECENT_PREVIOUS_STATUS_DATE
FROM
#STATUSES T1
LEFT JOIN
#STATUSES T2
ON
T1.EMPLOYEE_ID = T2.EMPLOYEE_ID
AND
T1.EFFECTIVE_DATE > T2.EFFECTIVE_DATE
AND
T1.STATUS_CODE <> T2.STATUS_CODE
GROUP BY
T1.EMPLOYEE_ID, T1.EFFECTIVE_DATE, T1.STATUS_CODE
) SubQuery
GROUP BY
EMPLOYEE_ID, STATUS_CODE, MOST_RECENT_PREVIOUS_STATUS_DATE

Related

Constructing single select statement that returns order depends on the value of a column in SQL Server

Table1
Id bigint primary key identity(1,1)
Status nvarchar(20)
Insert dummy data
Insert into Table1 values ('Open') --1
Insert into Table1 values ('Open') --2
Insert into Table1 values ('Grabbed') --3
Insert into Table1 values ('Closed') --4
Insert into Table1 values ('Closed') --5
Insert into Table1 values ('Open') --6
How would I construct a single select statement which orders the data where records with 'Grabbed' status is first, followed by 'Closed', followed by 'Open' in SQL Server
Output:
Id Status
3 Grabbed
4 Closed
5 Closed
1 Open
2 Open
6 Open
I think you need something like this:
select *
from yourTable
order by case when Status = 'Grabbed' then 1
when Status = 'Closed' then 2
when Status = 'Open' then 3
else 4 end
, Id;
[SQL Fiddle Demo]
Another way is to using CTE like this:
;with cte as (
select 'Grabbed' [Status], 1 [order]
union all select 'Closed', 2
union all select 'Open', 3
)
select t.*
from yourTable t
left join cte
on t.[Status] = cte.[Status]
order by cte.[order], Id;
[SQL Fiddle Demo]
This could be done much better with a properly normalized design:
Do not store your Status as a textual content. Just imagine a typo (a row with Grabed)...
Further more a lookup table allows you to add side data, e.g. a sort order.
CREATE TABLE StatusLookUp(StatusID INT IDENTITY PRIMARY KEY /*you should name your constraints!*/
,StatusName VARCHAR(100) NOT NULL
,SortRank INT NOT NULL)
INSERT INTO StatusLookUp VALUES
('Open',99) --ID=1
,('Closed',50)--ID=2
,('Grabbed',10)--ID=3
CREATE TABLE Table1(Id bigint primary key identity(1,1) /*you should name your constraints!*/
,StatusID INT FOREIGN KEY REFERENCES StatusLookUp(StatusID));
Insert into Table1 values (1) --1
Insert into Table1 values (1) --2
Insert into Table1 values (3) --3
Insert into Table1 values (2) --4
Insert into Table1 values (2) --5
Insert into Table1 values (1) --6
SELECT *
FROM Table1 AS t1
INNER JOIN StatusLookUp AS s ON t1.StatusID=s.StatusID
ORDER BY s.SortRank;
I find that the simplest method uses a string:
order by charindex(status, 'Grabbed,Closed,Open')
or:
order by charindex(',' + status + ',', ',Grabbed,Closed,Open,')
If you are going to put values in the query, I think the easiest way uses values():
select t1.*
from t1 left join
(values ('Grabbed', 1), ('Closed', 2), ('Open', 3)) v(status, priority)
on t1.status = v.status
order by coalesce(v.priority, 4);
Finally. This need suggests that you should have a reference table for statuses. Rather than putting the string name in other tables, put an id. The reference table can have the priority as well as other information.
Try this:
select Id,status from tablename where status='Grabbed'
union
select Id,status from tablename where status='Closed'
union
select Id,status from tablename where status='Open'

Count number of records returned by temp table - SQL Server

My script is as below
CREATE TABLE #t (Id int, Name varchar(10))
INSERT INTO #t VALUES (1, 'A')
INSERT INTO #t VALUES (1, 'B')
INSERT INTO #t VALUES (1, 'C')
INSERT INTO #t VALUES (1, 'D')
INSERT INTO #t VALUES (2, 'E')
SELECT COUNT(0)FROM (SELECT COUNT(0) FROM #t GROUP BY Id) a
but I am getting an error
Msg 8155, Level 16, State 2, Line 5
No column name was specified for column 1 of 'A'.
When you use a subquery, all the columns need to given names:
SELECT COUNT(0)
FROM (SELECT COUNT(0) as cnt FROM #t GROUP BY Id
) a;
However, a simpler way to write this is:
SELECT COUNT(DISTINCT id)
FROM #t;
Actually, this isn't exactly the same. Your version will count NULL values but this does not. The exact equivalent is:
SELECT COUNT(DISTINCT id) + MAX(CASE WHEN id IS NULL THEN 1 ELSE 0 END)
FROM #t;

Returning a set of the most recent rows from a table

I'm trying to retrieve the latest set of rows from a source table containing a foreign key, a date and other fields present. A sample set of data could be:
create table #tmp (primaryId int, foreignKeyId int, startDate datetime,
otherfield varchar(50))
insert into #tmp values (1, 1, '1 jan 2010', 'test 1')
insert into #tmp values (2, 1, '1 jan 2011', 'test 2')
insert into #tmp values (3, 2, '1 jan 2013', 'test 3')
insert into #tmp values (4, 2, '1 jan 2012', 'test 4')
The form of data that I'm hoping to retrieve is:
foreignKeyId maxStartDate otherfield
------------ ----------------------- -------------------------------------------
1 2011-01-01 00:00:00.000 test 2
2 2013-01-01 00:00:00.000 test 3
That is, just one row per foreignKeyId showing the latest start date and associated other fields - the primaryId is irrelevant.
I've managed to come up with:
select t.foreignKeyId, t.startDate, t.otherField from #tmp t
inner join (
select foreignKeyId, max(startDate) as maxStartDate
from #tmp
group by foreignKeyId
) s
on t.foreignKeyId = s.foreignKeyId and s.maxStartDate = t.startDate
but (a) this uses inner queries, which I suspect may lead to performance issues, and (b) it gives repeated rows if two rows in the original table have the same foreignKeyId and startDate.
Is there a query that will return just the first match for each foreign key and start date?
Depending on your sql server version, try the following:
select *
from (
select *, rnum = ROW_NUMBER() over (
partition by #tmp.foreignKeyId
order by #tmp.startDate desc)
from #tmp
) t
where t.rnum = 1
If you wanted to fix your attempt as opposed to re-engineering it then
select t.foreignKeyId, t.startDate, t.otherField from #tmp t
inner join (
select foreignKeyId, max(startDate) as maxStartDate, max(PrimaryId) as Latest
from #tmp
group by foreignKeyId
) s
on t.primaryId = s.latest
would have done the job, assuming PrimaryID increases over time.
Qualms about inner query would have been laid to rest as well assuming some indexes.

project a sparse result at some level

I don't really know what to call this but it's not that hard to explain
Basically what I have is a result like this
Similarity ColumnA ColumnB ColumnC
1 SomeValue NULL SomeValue
2 NULL SomeB NULL
3 SomeValue NULL SomeC
4 SomeA NULL NULL
This result is created by matching a set of strings against another table. Each string also contains some values for these ColumnA..C which are the values I wan't to aggregate in some way.
Something like min/max works very well but I can't figure out how to get it to account for the highest similarity not just the min/max value. I don't really want the min/max, I want the first non-null value with the highest similarity.
Ideally the result would look like this
ColumnA ColumnB ColumnC
SomeA SomeB SomeC
I'd like be able to efficiently join in the temporary result to compute the rest and I've been exploring different options. Something which I've been considering is creating a SQL Server CLR aggregate the yields the "first" non-null value but I'm unsure if there's even such a thing as a first or last when running an aggregate on a result.
Okay, so I figured it out, I originally had trouble with the UPDATE FROM and JOIN not playing well together. I was counting on that the UPDATE would just occur multiple times and that would give me the correct results, however, there's no such guarantee from SQL Server (it's actually undefined behavior and alltough it appeared to work we'll have none of that) but since you can run UPDATE against a CTE I combined that with the OUTER APPLY to select the exactly 1 row to complement a missing value if possible.
Here's the whole thing with test data as well.
DECLARE #cost TABLE (
make nvarchar(100) not null,
model nvarchar(100),
a numeric(18,2),
b numeric(18,2)
);
INSERT #cost VALUES ('a%', null, 100, 2);
INSERT #cost VALUES ('a%', 'a%', 149, null);
INSERT #cost VALUES ('a%', 'ab', 349, null);
INSERT #cost VALUES ('b', null, null, 2.5);
INSERT #cost VALUES ('b', 'b%', 249, null);
INSERT #cost VALUES ('b', 'b', null, 3);
DECLARE #unit TABLE (
id int,
make nvarchar(100) not null,
model nvarchar(100)
);
INSERT #unit VALUES (1, 'a', null);
INSERT #unit VALUES (2, 'a', 'a');
INSERT #unit VALUES (3, 'a', 'ab');
INSERT #unit VALUES (4, 'b', null);
INSERT #unit VALUES (5, 'b', 'b');
DECLARE #tmp TABLE (
id int,
specificity int,
a numeric(18,2),
b numeric(18,2),
primary key(id, specificity)
);
INSERT #tmp
OUTPUT inserted.* --FOR DEBUGGING
SELECT
unit.id
, ROW_NUMBER() OVER (
PARTITION BY unit.id
ORDER BY cost.make DESC, cost.model DESC
) AS specificity
, cost.a
, cost.b
FROM #unit unit
INNER JOIN #cost cost ON unit.make LIKE cost.make
AND (cost.model IS NULL OR unit.model LIKE cost.model)
;
--fix the holes
WITH tmp AS (
SELECT *
FROM #tmp
WHERE specificity = 1
AND (a IS NULL OR b IS NULL) --where necessary
)
UPDATE tmp
SET
tmp.a = COALESCE(tmp.a, a.a)
, tmp.b = COALESCE(tmp.b, b.b)
OUTPUT inserted.* --FOR DEBUGGING
FROM tmp
OUTER APPLY (
SELECT TOP 1 a
FROM #tmp a
WHERE a.id = tmp.id
AND a.specificity > 1
AND a.a IS NOT NULL
ORDER BY a.specificity
) a
OUTER APPLY (
SELECT TOP 1 b
FROM #tmp b
WHERE b.id = tmp.id
AND b.specificity > 1
AND b.b IS NOT NULL
ORDER BY b.specificity
) b
;

Select Records that match ALL groups in a many to many join table

I have 2 tables: sets and groups. Both are joined using a 3rd table set_has_groups.
I would like to get sets that have ALL groups that I specify
One way of doing it would be
SELECT column1, column2 FROM sets WHERE
id IN(SELECT set_id FROM set_has_group WHERE group_id = 1)
AND id IN(SELECT set_id FROM set_has_group WHERE group_id = 2)
AND id IN(SELECT set_id FROM set_has_group WHERE group_id = 3)
obviously this is not the most beautiful solution
I've also tried this:
SELECT column1, column2 FROM sets WHERE
id IN(SELECT set_id FROM set_has_group WHERE group_id IN(1,2,3) GROUP BY group_id
HAVING COUNT(*) = 3
This looks prettier but the problem is that it takes forever to execute.
While the first query runs in like 200ms the 2nd one takes more than 1 minute.
Any idea why that is?
===UPDATE:
I've played with this some more and I modified the 2nd query like this
SELECT columns FROM `set` WHERE id IN(
select set_id FROM
(
SELECT set_id FROM set_has_group
WHERE group_id IN(1,2,3)
GROUP BY set_id HAVING COUNT(*) = 3
) as temp
)
that is really fast
It's the same as the 2nd query before just that I wrap it in another temporary table
Pretty strange
I am suspecting a small mistyping in the second query.
Really, I am not sure. Probably, the second query is executed via full table scan. At the same time the first one "IN" is really transformed into "EXISTS". So, you can try to use "exists". For example:
...
where 3 = (select count(*) from set_has_group
where group_id in (1, 2, 3) and set_id = id
group by set_id)
Assuming SQL Server, here is a working example with a JOIN that should work better than the IN clauses you are using as long as you have your primary and foreign keys set correctly. I have built joined 5 sets to 3 groups, but set 4 and 5 are not a part of group 3 and will not show in the answer. However, this query is not scalable (for ex. find in group 4, 5, 7, 8 and 13 will require code modifications unless you parse input params into a table variable)
set nocount on
declare #sets table
(
Id INT Identity (1, 1),
Column1 VarChar (50),
Column2 VarChar (50)
)
declare #Set_Has_Group table
(
Set_Id Int,
Group_Id Int
)
insert into #sets values (newid(), newid())
insert into #sets values (newid(), newid())
insert into #sets values (newid(), newid())
insert into #sets values (newid(), newid())
insert into #sets values (newid(), newid())
update #sets set column1 = 'Column1 at Row ' + Convert (varchar, id)
update #sets set column2 = 'Column2 at Row ' + Convert (varchar, id)
insert into #Set_Has_Group values (1, 1)
insert into #Set_Has_Group values (1, 2)
insert into #Set_Has_Group values (1, 3)
insert into #Set_Has_Group values (2, 1)
insert into #Set_Has_Group values (2, 2)
insert into #Set_Has_Group values (2, 3)
insert into #Set_Has_Group values (3, 1)
insert into #Set_Has_Group values (3, 2)
insert into #Set_Has_Group values (3, 3)
insert into #Set_Has_Group values (4, 1)
insert into #Set_Has_Group values (4, 2)
insert into #Set_Has_Group values (5, 1)
insert into #Set_Has_Group values (5, 2)
/* your query with IN */
SELECT column1, column2 FROM #sets WHERE
id IN(SELECT set_id FROM #set_has_group WHERE group_id = 1)
AND id IN(SELECT set_id FROM #set_has_group WHERE group_id = 2)
AND id IN(SELECT set_id FROM #set_has_group WHERE group_id = 3)
/* my query with JOIN */
SELECT * -- Column1, Column2
FROM #sets sets
WHERE 3 = (
SELECT Count (1)
FROM #Set_Has_Group Set_Has_Group
WHERE 1=1
AND sets.Id = Set_Has_Group.Set_Id
AND Set_Has_Group.Group_ID IN (1, 2, 3)
Group by Set_Id
)
Here's a solution that uses a non-correlated subquery and no GROUP BY:
SELECT column1, column2
FROM sets
WHERE id IN (
SELECT g1.set_id FROM set_has_group g1
JOIN set_has_group g2 ON (g1.set_id = g3.set_id)
JOIN set_has_group g3 ON (g1.set_id = g3.set_id)
WHERE g1.group_id = 1 AND g2.group_id = 2 AND g3.group_id = 3);