Can I improve this query for use in large tables? - sql

How can I improve this query for use in large tables....?
I use a table ('DataValues') to store a collection of values ('Value') for collections ('Visit_id') ie it records certain values for each visit.
I use a table ('MatchItems') to store dynamic match sets 'MatchSet' of values ('Value'), sets can contain any number of values. The table also has a IsNeg field to indicate if the match should require a value to be not present in the visit collection.
This allows me to dynamically match visits that conform to certain criteria such as
Must contain values A, B and C and NOT D OR C and B AND NOT A.
ie (Value = A and Value = B and Value = C and Value /= D)
or (Value = C and Value = B and Value /= A)
I have a query that delivers a reasonable solution fiddle:
-- DataValues: one row per value recorded for a visit.
CREATE TABLE DataValues (
    id       NUMBER(5) CONSTRAINT DataValues_pk PRIMARY KEY,
    Visit_id NUMBER(5),
    Value    VARCHAR(5)
);

-- Visit 1: M I C K E Y
INSERT INTO DataValues (id, Visit_id, Value) VALUES (1, 1, 'M');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (2, 1, 'I');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (3, 1, 'C');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (4, 1, 'K');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (5, 1, 'E');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (6, 1, 'Y');
-- Visit 2: M O U S E
INSERT INTO DataValues (id, Visit_id, Value) VALUES (7, 2, 'M');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (8, 2, 'O');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (9, 2, 'U');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (10, 2, 'S');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (11, 2, 'E');
-- Visit 3: C A T
INSERT INTO DataValues (id, Visit_id, Value) VALUES (12, 3, 'C');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (13, 3, 'A');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (14, 3, 'T');
-- Visit 4: S A T
INSERT INTO DataValues (id, Visit_id, Value) VALUES (15, 4, 'S');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (16, 4, 'A');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (17, 4, 'T');
-- Visit 5: M A T
INSERT INTO DataValues (id, Visit_id, Value) VALUES (18, 5, 'M');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (19, 5, 'A');
INSERT INTO DataValues (id, Visit_id, Value) VALUES (20, 5, 'T');

-- MatchItems: one row per value in a match set.
-- IsNeg = 0 marks a value the visit must contain, IsNeg = 1 a value it must not contain.
CREATE TABLE MatchItems (
    id       NUMBER(5) CONSTRAINT MatchItems_pk PRIMARY KEY,
    MatchSet NUMBER(5),
    Value    VARCHAR(5),
    IsNeg    NUMBER(1) NOT NULL CHECK (IsNeg IN (0, 1))
);

-- Set 1: requires M, I, C, K, E, Y
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (1, 1, 'M', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (2, 1, 'I', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (3, 1, 'C', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (4, 1, 'K', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (5, 1, 'E', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (6, 1, 'Y', 0);
-- Set 2: requires C, A
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (7, 2, 'C', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (8, 2, 'A', 0);
-- Set 3: requires A, T
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (9, 3, 'A', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (10, 3, 'T', 0);
-- Set 4: requires A, forbids S and K
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (11, 4, 'S', 1);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (12, 4, 'A', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (13, 4, 'K', 1);
-- Set 5: requires A, T
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (14, 5, 'A', 0);
INSERT INTO MatchItems (id, MatchSet, Value, IsNeg) VALUES (15, 5, 'T', 0);
-- List every (MatchSet, Visit_id) pair where the visit satisfies the match set:
--   * the visit contains every required value of the set (IsNeg = 0), and
--   * the visit contains none of the forbidden values of the set (IsNeg = 1).
--
-- Changes from the first draft:
--   * GROUP BY now precedes HAVING (the standard clause order).
--   * Both joins are INNER JOINs: rows without a matching MatchItems row, or
--     sets without any required value, could never pass the HAVING filter,
--     so the outer joins only added rows to be grouped and then discarded.
--   * The aggregate columns are named.
SELECT
    mi.MatchSet,
    dv.Visit_id,
    gp.PositiveCount AS TgtCount,
    COUNT(mi.Id)     AS MatchedCount,
    SUM(mi.IsNeg)    AS NegMatchedCount
FROM DataValues dv
INNER JOIN MatchItems mi
    ON mi.Value = dv.Value
    -- AND mi.MatchSet = 4   -- uncomment to restrict the check to one match set
INNER JOIN (
    -- Number of required (non-negated) values in each match set.
    SELECT
        MatchSet,
        COUNT(*) AS PositiveCount
    FROM MatchItems
    WHERE IsNeg = 0
    GROUP BY MatchSet
) gp
    ON gp.MatchSet = mi.MatchSet
GROUP BY
    mi.MatchSet,
    dv.Visit_id,
    gp.PositiveCount
HAVING
    -- With no forbidden value matched (second condition), every matched row
    -- is a required one, so the row count must equal the required count.
    COUNT(mi.Id) = gp.PositiveCount
    AND SUM(mi.IsNeg) = 0;
How can I improve the performance of this query where the DataValues table contains 100m records, and MatchItems may include a collection of 50 sets each of 2 - 20 values?

You can try this version using Analytic functions and see if it performs any better. This query removes the subquery GpMatchItems that you are joining with.
-- Analytic-function variant: computes the per-set target count and the
-- per-(set, visit) match statistics in one pass over the join, then keeps
-- the pairs whose counts agree and that matched no negated item.
-- NOTE(review): tgtcount is computed over the joined rows, so it only counts
-- non-negated MatchItems whose Value occurs at least once in DataValues.
-- A required value that no visit contains is therefore not counted, and the
-- set can be reported as matched without it.
-- NOTE(review): the LEFT JOIN also yields a NULL-MatchSet partition for
-- unmatched DataValues rows; its NULL isneg_sum fails the outer WHERE.
SELECT DISTINCT matchset,
visit_id,
tgtcount,
match_visit_count,
isneg_sum
FROM (SELECT MatchItems.MatchSet,
DataValues.Visit_id,
COUNT (DISTINCT CASE MatchItems.IsNeg WHEN 0 THEN MatchItems.id ELSE NULL END)
OVER (PARTITION BY MatchItems.MatchSet)
AS tgtcount,
COUNT (*) OVER (PARTITION BY MatchItems.MatchSet, DataValues.Visit_id)
AS match_visit_count,
SUM (MatchItems.IsNeg) OVER (PARTITION BY MatchItems.MatchSet, DataValues.Visit_id)
AS isneg_sum
FROM DataValues LEFT JOIN MatchItems ON MatchItems.VALUE = DataValues.VALUE)
WHERE tgtcount = match_visit_count AND isneg_sum = 0;

I have adjusted EJ's suggestion to include a LEFT JOIN to collect the tgtCount to identify the total number of good matches required in each MatchSet:
-- Analytic variant with the target count taken from MatchItems itself.
-- tgtcount comes from a per-set aggregate (number of non-negated items in the
-- set) instead of a window over the joined rows, so required values that no
-- visit contains are still counted.
-- The inner query yields one row per matched (DataValues, MatchItems) pair
-- with the per-(set, visit) statistics attached; DISTINCT collapses them to
-- one row per (set, visit).
SELECT DISTINCT
    matchset,
    visit_id,
    tgtcount,
    match_visit_count,
    isneg_sum
FROM (
    SELECT
        MatchItems.MatchSet,
        DataValues.Visit_id,
        GpMatchItems.PositiveCount AS tgtcount,
        COUNT (*) OVER (PARTITION BY MatchItems.MatchSet, DataValues.Visit_id)
            AS match_visit_count,
        SUM (MatchItems.IsNeg) OVER (PARTITION BY MatchItems.MatchSet, DataValues.Visit_id)
            AS isneg_sum
    FROM DataValues
    LEFT JOIN MatchItems
        ON MatchItems.VALUE = DataValues.VALUE
    LEFT JOIN (
        -- Number of required (non-negated) values in each match set.
        SELECT
            MatchItems.MatchSet,
            COUNT(*) AS PositiveCount
        FROM MatchItems
        WHERE MatchItems.IsNeg = 0
        GROUP BY
            MatchItems.MatchSet
    ) GpMatchItems
        ON GpMatchItems.MatchSet = MatchItems.MatchSet
)
WHERE
    tgtcount = match_visit_count
    AND isneg_sum = 0;

Related

Group table results with information from another

I'm trying to perform a query that returns an aggregation of values from the same table with information from others through a foreign key, but I can't. In the example below, I wanted to return the total sales by state on 2020-01-01 and 2021-01-01, showing the name of the state.
Tables script:
-- Schema: estado (state) -> municipio (city) -> vendas (sales).
-- Each CREATE TABLE is terminated with a semicolon so the script can be run
-- as a single batch.
CREATE TABLE IF NOT EXISTS estado (
    id SERIAL PRIMARY KEY,
    estado VARCHAR(100)
);
CREATE TABLE IF NOT EXISTS municipio (
    id SERIAL PRIMARY KEY,
    estado integer REFERENCES estado(id),
    municipio VARCHAR(100)
);
CREATE TABLE IF NOT EXISTS vendas (
    id SERIAL PRIMARY KEY,
    municipio integer REFERENCES municipio(id),
    valor numeric,
    data_venda date
);
-- States
INSERT INTO estado VALUES (1, 'PR');
INSERT INTO estado VALUES (2, 'SC');
INSERT INTO estado VALUES (3, 'RS');
-- Cities: (id, state id, name)
INSERT INTO municipio VALUES (1, 1, 'Pelotas');
INSERT INTO municipio VALUES (2, 1, 'Caxias do Sul');
INSERT INTO municipio VALUES (3, 1, 'Porto Alegre');
INSERT INTO municipio VALUES (4, 2, 'Florianopolis');
INSERT INTO municipio VALUES (5, 2, 'Chapeco');
INSERT INTO municipio VALUES (6, 2, 'Itajai');
INSERT INTO municipio VALUES (7, 3, 'Curitiba');
INSERT INTO municipio VALUES (8, 3, 'Maringa');
INSERT INTO municipio VALUES (9, 3, 'Foz do Iguaçu');
-- Sales: (id, city id, amount, sale date)
INSERT INTO vendas VALUES (1, 6, 5, '2020-01-01');
INSERT INTO vendas VALUES (2, 5, 10, '2021-01-01');
INSERT INTO vendas VALUES (3, 5, 5, '2020-01-01');
INSERT INTO vendas VALUES (4, 4, 2, '2020-01-01');
INSERT INTO vendas VALUES (5, 3, 10, '2021-01-01');
INSERT INTO vendas VALUES (6, 3, 12, '2020-01-01');
INSERT INTO vendas VALUES (7, 3, 20, '2020-01-01');
INSERT INTO vendas VALUES (8, 2, 10, '2020-01-01');
INSERT INTO vendas VALUES (9, 1, 11, '2021-01-01');
INSERT INTO vendas VALUES (10, 9, 4, '2020-01-01');
My attempt (absurd values and the RS ones do not appear):
-- Asker's attempt, kept as posted to illustrate the problem.
-- v is restricted to the 2021-01-01 sales and joined to its city and state;
-- v2 is restricted to the 2020-01-01 sales but has no join condition at all,
-- so every 2021 row is paired with every 2020 row regardless of state.
-- Both sums are therefore inflated, and a state appears only if it has a
-- 2021-01-01 sale.
SELECT
e.estado, SUM(v.valor) as sum2021, SUM(v2.valor) as sum2020
FROM vendas v
CROSS JOIN vendas v2
INNER JOIN municipio m ON v.municipio = m.id
INNER JOIN estado e ON m.estado = e.id
WHERE v.data_venda = '2021-01-01'
AND v2.data_venda = '2020-01-01'
GROUP BY 1;
Translating some terms:
município = city
estado = state
vendas = sales
valor = value
data_venda = date of sale
You're cross joining vendas with itself (as v and v2), meaning that each row from it will be matched with every other row (i.e., a Cartesian product), which creates the unexpected results you're seeing.
The good news is that you don't need this join. You can use an aggregate function (sum in this case) on a subset of the rows from the query using the filter clause:
-- Total sales per state, one column per sale date.
-- Each SUM aggregates only the rows that pass its FILTER, so a single pass
-- over vendas is enough and no self-join is needed.
SELECT
    st.estado,
    SUM(sale.valor) FILTER (WHERE sale.data_venda = '2021-01-01') AS sum2021,
    SUM(sale.valor) FILTER (WHERE sale.data_venda = '2020-01-01') AS sum2020
FROM estado st
INNER JOIN municipio city ON city.estado = st.id
INNER JOIN vendas sale ON sale.municipio = city.id
GROUP BY
    st.estado;
SQLFiddle demo

summing by rows sql

I attempted to do it using the analytical function, but it appears that I did so improperly...
How can I receive the output from the table I've been given?
-- rides: one row per ride taken by a driver.
CREATE TABLE rides (
    ride_id INT,
    driver_id INT,
    ride_in_kms INT,
    ride_fare FLOAT,
    ride_date DATE
);
-- Date literals use single quotes: double-quoted text is an identifier in
-- standard SQL (and in MySQL when ANSI_QUOTES is enabled).
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (1, 1, 3, 4.45, '2016-05-16');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (2, 1, 4, 8.46, '2016-05-16');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (3, 2, 6, 11.9, '2016-05-16');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (4, 3, 3, 6.76, '2016-05-16');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (5, 2, 6, 13.55, '2016-05-16');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (6, 4, 3, 4.91, '2016-05-20');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (7, 1, 7, 16.77, '2016-05-20');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (8, 3, 9, 16.18, '2016-05-20');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (9, 2, 3, 6.07, '2016-05-20');
INSERT INTO rides (ride_id, driver_id, ride_in_kms, ride_fare, ride_date) VALUES (10, 4, 4, 6.25, '2016-05-20');
Output result
Thanks in advance
The general gist is to use an expression within the sum() to operate on the correct rows:
-- Kilometres driven per driver, one column per ride date.
-- Each SUM adds ride_in_kms only for rows of its own date (0 otherwise).
-- The FROM clause now names the rides table, which the posted query omitted.
select
    driver_id,
    sum(case when ride_date = '2016-05-16' then ride_in_kms else 0 end) as KMS_MAY_16,
    sum(case when ride_date = '2016-05-20' then ride_in_kms else 0 end) as KMS_MAY_20
from rides
group by driver_id;
The particular syntax available, and how to express the column label depends on what database you are using.

Retrieving consecutive rows (and the counts) with the same values

I've got a table with almost 10 million views and would like to run this query on the latest million or hundred thousand or so.
Here's a SQL fiddle with example data and input/output: http://sqlfiddle.com/#!9/340a41
Is this even possible?
-- Sample data: 14 rows of (id, name, value) with ascending ids.
CREATE TABLE object (
    `id`    INT,
    `name`  VARCHAR(7),
    `value` INT
);

INSERT INTO object (`id`, `name`, `value`) VALUES
    (1, 'a', 1),   (2, 'b', 2),   (3, 'c', 100),
    (4, 'a', 1),   (5, 'b', 2),   (6, 'c', 200),
    (7, 'a', 2),   (8, 'b', 2),   (9, 'c', 300),
    (10, 'a', 2),  (11, 'b', 2),
    (12, 'a', 2),  (13, 'b', 2),  (14, 'c', 400);
-- Want:
-- name, max(id), count(id)
-- 'a', 4, 2
-- 'b', 14, 5
-- 'a', 12, 3
If you want the latest and the id is implemented sequentially, then you can do this using limit or top. In SQL Server:
-- SQL Server: the 100,000 rows with the highest id.
SELECT TOP 100000
    obj.*
FROM object obj
ORDER BY obj.id DESC;
In MySQL, you would use limit:
-- MySQL: the 100,000 rows with the highest id.
SELECT obj.*
FROM object obj
ORDER BY obj.id DESC
LIMIT 100000;
-- Aggregate per name over the 1,000,000 rows with the highest id.
-- Template for two servers: keep the TOP line on SQL Server or the LIMIT line
-- on MySQL, and delete the other one.
-- LIMIT has to follow ORDER BY, the derived table needs an alias on both
-- servers, and a MySQL "--" comment needs a space after the dashes.
select name, count(id) cnt, max(id) max_id, max(value) max_v
from
(select
top 1000000 -- MS SQL Server
id, name, value
from myTable
order by id desc
limit 1000000 -- MySQL
) latest
group by name
remove line which doesn't match your server.

SQL Query count

HI there I have this table,
Recipe = (idR, recipeTitle, prepText, cuisineType, mealType)
Ingredient = (idI, ingrDesc)
RecipIngr = (idR*, idI*)
and I'm trying to query a list of ingrDesc values with a count of how many recipes each ingrDesc is in. I want to list only those ingrDesc values that occur more than 10 times.
Here's what I have:
-- Asker's attempt, kept as posted.
-- NOTE(review): per the schema listed in the question, Recipe has no idI
-- column, and the junction table is spelled RecipIngr there but recpingr here.
-- The WHERE clause compares prepText with the literal string '>10'; it does
-- not count anything. Counting rows per ingredient needs GROUP BY and HAVING.
SELECT a.idI, a.recipeTitle
FROM Recipe a
INNER JOIN recpingr b
ON a.idr = b.idr
WHERE a.preptext = '>10'
Any help as I don't know how to carry on with this query
Use GROUP BY with HAVING:
-- Ingredients that appear in more than 10 RecipIngr rows, with their row count.
SELECT
    ing.idI,
    ing.ingrDesc,
    COUNT(*)
FROM RecipIngr link
INNER JOIN Ingredient ing
    ON ing.idI = link.idI
GROUP BY
    ing.idI,
    ing.ingrDesc
HAVING COUNT(*) > 10
You need to use a group by clause and having. I have created a quick sample here but my sample data does not go up to 10 so I used any ingredient that was used more than once (> 1).
Here is the sample data:
-- Sample schema and data: three recipes, eleven ingredients and the
-- recipe-to-ingredient links between them.
CREATE TABLE dbo.recipe (
    idR         INT           NOT NULL,
    recipeTitle VARCHAR(100)  NOT NULL,
    prepText    VARCHAR(4000) NULL,
    cuisineType VARCHAR(100)  NULL,
    mealType    VARCHAR(100)  NULL
);
GO
INSERT INTO dbo.recipe (idR, recipeTitle, prepText, cuisineType, mealType)
VALUES (1, 'Eggs and Bacon', 'Prep Text 1', 'American', 'Breakfast');
INSERT INTO dbo.recipe (idR, recipeTitle, prepText, cuisineType, mealType)
VALUES (2, 'Turkey Sandwich', 'Prep Text 2', 'American', 'Lunch');
INSERT INTO dbo.recipe (idR, recipeTitle, prepText, cuisineType, mealType)
VALUES (3, 'Roast Beef Sandwich', 'Prep Text 3', 'American', 'Lunch');
GO
CREATE TABLE dbo.ingredient (
    idI      INT          NOT NULL,
    ingrDesc VARCHAR(200) NOT NULL
);
GO
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (1, 'Large Egg');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (2, 'Bacon');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (3, 'Butter');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (4, 'Sliced Turkey');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (5, 'Lettuce');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (6, 'Tomato');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (7, 'Onion');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (8, 'Bread');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (9, 'Mustard');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (10, 'Horseradish');
INSERT INTO dbo.ingredient (idI, ingrDesc) VALUES (11, 'Sliced Roast Beef');
GO
CREATE TABLE dbo.recipingr (
    idR INT NOT NULL,
    idI INT NOT NULL
);
GO
-- Recipe 1: Eggs and Bacon
INSERT INTO dbo.recipingr (idR, idI) VALUES (1, 1);
INSERT INTO dbo.recipingr (idR, idI) VALUES (1, 2);
-- Recipe 2: Turkey Sandwich
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 4);
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 5);
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 6);
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 7);
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 8);
INSERT INTO dbo.recipingr (idR, idI) VALUES (2, 9);
-- Recipe 3: Roast Beef Sandwich
INSERT INTO dbo.recipingr (idR, idI) VALUES (3, 11);
INSERT INTO dbo.recipingr (idR, idI) VALUES (3, 10);
INSERT INTO dbo.recipingr (idR, idI) VALUES (3, 8);
INSERT INTO dbo.recipingr (idR, idI) VALUES (3, 6);
INSERT INTO dbo.recipingr (idR, idI) VALUES (3, 5);
GO
Here is the query:
-- Ingredient descriptions used by more than one recipe link, with the
-- number of links for each.
SELECT
    ing.ingrDesc,
    COUNT(*) AS ingrCount
FROM dbo.recipe rec
INNER JOIN dbo.recipingr link
    ON link.idR = rec.idR
INNER JOIN dbo.ingredient ing
    ON ing.idI = link.idI
GROUP BY ing.ingrDesc
HAVING COUNT(*) > 1

TSQL Cascade / Waterfall value from current row into the next [closed]

It's difficult to tell what is being asked here. This question is ambiguous, vague, incomplete, overly broad, or rhetorical and cannot be reasonably answered in its current form. For help clarifying this question so that it can be reopened, visit the help center.
Closed 10 years ago.
My earlier edits were a little muddled. Hopefully this clears it up ...
TL/DR -- just copy and execute the two script blocks and it will become apparent.
I have a question on cascading data. Essentially I am trying to move data down in a waterfall effect according to some predefined conditions (below). I've solved 15 of the 18 scenarios and I need help with the remaining 3: the scenarios with GIDs 9, 10 and 18.
For a bit of perspective, in the system I'm working on data is continually imported into the system. The data is sparse, and I'm working to reconstitute a full set of data to complete the import process. I have little control over the shape of the data in the system, or that is provided to me:-/
Ultimately the question is: how do I satisfy the 5 cascading rules below, or alternately, how do I solve for test case #18 I've provided in the script below?
The Cascade Rules
In this simplified scenario the 'rules' for cascading are as follows:
Data will be cascaded only within the same group (GID)
A group of data will be ordered starting at 1 (Seq)
IsLive column will be either 1 or 0
If IsLive = 1 then move data down the rows until you encounter another IsLive = 1 or IsLive = 0 which has a non-null value
If IsLive = 0 then move data down the rows until you hit another IsLive = 0 with a value.
Note: My script is a simplified example, but in the full scenario there are N columns on which I need to cascade.
Solution Notes
If you run the SQL below you will see four columns of interest: Input; Output (the result of the CTE); Expected (the expected result); and Result (Pass/Fail). I have included a script that both creates the sample table and runs the test cases when executed.
The test cases script below has sample data
The test case script has a column I appended for the correct expected value. (Look for GID=18 in the INSERT script.)
I hope someone can help, if not I might have to resort to a SQL CLR SP solution. Also, I'm not tied to this solution, you may also completely discard my solution and come up with something new.
Test Case
-- Test fixture: 18 groups (GID) of rows ordered by Seq.
-- Name is the input value; Expected is the value the cascade should produce.
-- "DECLARE #Test TABLE" is not valid T-SQL. Every statement that uses this
-- fixture refers to it as #Test, so it is created as a temporary table
-- (dropped first so the script can be re-run in the same session).
IF OBJECT_ID('tempdb..#Test') IS NOT NULL DROP TABLE #Test
CREATE TABLE #Test (GID int, Seq int, IsLive bit,
Eff date,
Name varchar(50),
Expected varchar(50)) -- expected val should help debug!
INSERT INTO #Test VALUES (1, 1, 1, '01-08-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (1, 2, 0, '01-09-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (1, 3, 1, '01-10-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (1, 4, 0, '01-11-2012', NULL, 'RTA')
INSERT INTO #Test VALUES (1, 5, 1, '01-12-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (2, 1, 1, '01-08-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (2, 2, 0, '01-09-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (2, 3, 1, '01-10-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (2, 4, 0, '01-11-2012', 'GSM', 'GSM')
INSERT INTO #Test VALUES (2, 5, 1, '01-12-2012', 'FSA', 'GSM')
INSERT INTO #Test VALUES (3, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (3, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (4, 1, 1, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (4, 2, 0, '01-02-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (4, 3, 0, '01-03-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (5, 1, 0, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (5, 2, 1, '01-02-2012', 'LSI', 'LSI')
INSERT INTO #Test VALUES (5, 3, 0, '01-03-2012', NULL, 'LSI')
INSERT INTO #Test VALUES (6, 1, 1, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (6, 2, 0, '01-02-2012', 'LSI', 'LSI')
INSERT INTO #Test VALUES (6, 3, 1, '01-03-2012', NULL, 'LSI')
INSERT INTO #Test VALUES (7, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (7, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (7, 3, 1, '01-03-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (8, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (8, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (8, 3, 1, '01-03-2012', NULL, NULL)
INSERT INTO #Test VALUES (9, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (9, 2, 1, '01-02-2012', NULL, NULL)
INSERT INTO #Test VALUES (9, 3, 1, '01-03-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (10, 1, 1, '01-01-2012', 'FSA','FSA')
INSERT INTO #Test VALUES (10, 2, 1, '01-02-2012', 'GSM','GSM')
INSERT INTO #Test VALUES (10, 3, 1, '01-03-2012', 'RTS','RTS')
INSERT INTO #Test VALUES (11, 1, 0, '01-01-2012', 'NOP','NOP')
INSERT INTO #Test VALUES (11, 2, 1, '01-02-2012', 'TAP','NOP')
INSERT INTO #Test VALUES (11, 3, 1, '01-03-2012', 'STG','NOP')
INSERT INTO #Test VALUES (12, 1, 1, '01-01-2012', 'RTS','RTS')
INSERT INTO #Test VALUES (12, 2, 0, '01-02-2012', 'RTM','RTM')
INSERT INTO #Test VALUES (12, 3, 1, '01-03-2012', 'LSA','RTM')
INSERT INTO #Test VALUES (12, 4, 1, '01-03-2012', 'LSA','RTM')
INSERT INTO #Test VALUES (12, 5, 1, '01-03-2012', 'GSM','RTM')
INSERT INTO #Test VALUES (13, 1, 1, '01-08-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (13, 2, 0, '01-09-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (13, 3, 1, '01-10-2012', 'TST','TST')
INSERT INTO #Test VALUES (14, 1, 1, '01-08-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (14, 2, 0, '01-09-2012', 'GIP','GIP')
INSERT INTO #Test VALUES (14, 3, 1, '01-10-2012', 'TST','GIP')
INSERT INTO #Test VALUES (15, 1, 1, '01-01-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 2, 0, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 3, 1, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 4, 1, '01-02-2012', 'GYM','BAR')
INSERT INTO #Test VALUES (16, 1, 1, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (16, 2, 0, '01-03-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (16, 3, 1, '01-03-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (16, 4, 1, '01-03-2012', 'GYM','GYM')
INSERT INTO #Test VALUES (17, 1, 1, '01-02-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (17, 2, 0, '01-03-2012', 'GIP', 'GIP')
INSERT INTO #Test VALUES (17, 3, 0, '01-03-2012', NULL, 'GIP')
INSERT INTO #Test VALUES (17, 4, 1, '01-03-2012', 'TST', 'GIP')
-- -------------------------------------------
-- Following is the GID=18 test case that fails
-- -------------------------------------------
INSERT INTO #Test VALUES (18, 1, 1, '01-02-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (18, 2, 0, '01-03-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (18, 3, 0, '01-03-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (18, 4, 1, '01-03-2012', 'TST', 'BAR')
Solution
-- Asker's recursive-CTE attempt at the cascade (query logic kept as posted).
-- The leading "DECLARE #PrevNonLiveSeq int = NULL" was dropped: it is not
-- valid T-SQL (local variables start with @) and nothing below referenced it.
--
-- Anchor: the first row (Seq = 1) of every group, seeding the trackers for
-- the last non-live and last live Seq seen so far.
-- Recursive step: joins the next row (Seq + 1) of the same group plus the
-- rows at the tracked non-live / live positions, and decides which Name to
-- carry forward.
;WITH CTE AS (
SELECT T.GID, T.SEQ, T.IsLive, Expected
, Name AS Name
, CASE WHEN T.IsLive = 0 THEN T.SEQ ELSE NULL END As PrevNonLiveSeq
, CASE WHEN T.IsLive = 1 THEN T.SEQ ELSE NULL END As PrevLiveSeq
, NULL AS PerNonLiveSeqCalc
, NULL AS PerLiveSeqCalc
, 0 PrevSeq
, CAST(NULL AS varchar(50)) PrevName
FROM #Test T
WHERE T.Seq = 1
UNION ALL
SELECT Curr.GID, Curr.SEQ, Curr.IsLive, Curr.Expected
-- Non-live row: keep its own Name, else inherit the previous output.
-- Live row: depends on whether the tracked non-live row has a Name.
,CASE WHEN Curr.IsLive = 0 THEN ISNULL(Curr.Name, Prev.Name)
ELSE CASE WHEN PrevNonLive.Name IS NULL THEN
CASE WHEN Prev.Name <> PrevLive.Name THEN Prev.Name ELSE Curr.Name END
ELSE Prev.Name END
END
,CASE WHEN Curr.IsLive = 0 THEN Curr.SEQ ELSE Prev.PrevNonLiveSeq END As PrevNonLiveSeq
,CASE WHEN Curr.IsLive = 1 THEN Curr.SEQ ELSE Prev.PrevLiveSeq END As PrevLiveSeq
, ISNULL(Prev.PrevNonLiveSeq, Curr.SEQ) AS PerNonLiveSeqCalc
, ISNULL(Prev.PrevLiveSeq, Curr.SEQ) AS PerLiveSeqCalc
, Prev.Seq PrevSeq, Prev.Name PrevName
FROM CTE Prev
JOIN #Test Curr ON Curr.GID = Prev.GID AND Curr.SEQ = Prev.SEQ+1
JOIN #Test PrevNonLive ON Prev.GID = PrevNonLive.GID AND PrevNonLive.SEQ = ISNULL(Prev.PrevNonLiveSeq, Curr.SEQ)
JOIN #Test PrevLive ON Prev.GID = PrevLive.GID AND PrevLive.SEQ = ISNULL(Prev.PrevLiveSeq, Curr.SEQ)
)
-- Compare the cascaded Name with the Expected column, row by row.
SELECT CTE.GID, CTE.Seq, T.IsLive
, T.Name Input, CTE.Name [Output]
, CASE WHEN CTE.Name = CTE.Expected OR (CTE.Name IS NULL AND CTE.Expected IS NULL) THEN 'Pass' ELSE 'FAIL' END AS Result
, CTE.Expected
FROM CTE
INNER JOIN #Test T on CTE.GID = T.GID AND CTE.Seq = T.Seq
ORDER BY CTE.GID, CTE.Seq
Results
For results please copy and run in SSMS
Thanks!
This should work and does not require the recursive CTE. You would just need to do the COALESCE for each of the actual fields you wanted to "cascade".
-- Cascade without recursion: for each row (crrnt), OUTER APPLY picks at most
-- one earlier row (prir) of the same GID whose Name should flow down to it.
-- ORDER BY prir.Seq DESC with TOP 1 selects the closest qualifying earlier
-- row; when none qualifies, COALESCE falls back to the row's own Name.
SELECT crrnt.*, COALESCE(cscd.Name, crrnt.Name) AS [Output]
FROM #Test crrnt
OUTER APPLY (
SELECT TOP 1 *
FROM #Test prir
WHERE prir.GID = crrnt.GID
AND prir.Seq < crrnt.Seq
AND (
(
-- Live current row: take the Name of an earlier non-live row that has one.
crrnt.IsLive = 1
AND prir.IsLive = 0
AND prir.Name IS NOT NULL
)
OR (
-- Non-live current row without its own Name.
crrnt.IsLive = 0
AND crrnt.Name IS NULL
AND (
(
-- Candidate: an earlier non-live row that has a Name.
prir.IsLive = 0
AND prir.Name IS NOT NULL
)
OR (
-- Candidate: an earlier live row, but only when no named non-live row
-- precedes that live row.
prir.IsLive = 1
AND NOT EXISTS(
SELECT *
FROM #Test confirm
WHERE confirm.GID = prir.GID
AND confirm.Seq < prir.Seq
AND confirm.IsLive = 0
AND confirm.Name IS NOT NULL
)
)
)
)
)
ORDER BY prir.Seq DESC
) cscd
Edit:
It is generally a good idea to test the performance of your queries so the following is just that. The test consists of:
1. Start with originally posted query and sample data
2. Change Temp Variable to Temp Table (query will end up hitting real User Table)
3. Create Clustered Index on Temp Table, being: GID, Seq.
4. Duplicate the data, but with higher GID values (turn the original 63 rows in 18 groups into 6,300,063 rows)
5. Ensure equal environment with DBCC FREEPROCCACHE and DBCC DROPCLEANBUFFERS
6. Use STATISTICS IO and STATISTICS TIME
-- Performance-test setup: builds #Test once per session, loads the base test
-- cases, then multiplies them into a large data set.
SET NOCOUNT ON
-- DROP TABLE #Test
-- Build the fixture only when #Test does not exist yet.
IF (OBJECT_ID('tempdb.dbo.#Test') IS NULL)
BEGIN
CREATE TABLE #Test (GID INT NOT NULL, Seq INT NOT NULL, IsLive BIT NOT NULL,
Eff date,
Name varchar(50),
Expected varchar(50), -- expected val should help debug!
PRIMARY KEY(GID, Seq)
)
-- Base test cases: GID 1 through 18.
INSERT INTO #Test VALUES (1, 1, 1, '01-08-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (1, 2, 0, '01-09-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (1, 3, 1, '01-10-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (1, 4, 0, '01-11-2012', NULL, 'RTA')
INSERT INTO #Test VALUES (1, 5, 1, '01-12-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (2, 1, 1, '01-08-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (2, 2, 0, '01-09-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (2, 3, 1, '01-10-2012', 'FSA', 'RTA')
INSERT INTO #Test VALUES (2, 4, 0, '01-11-2012', 'GSM', 'GSM')
INSERT INTO #Test VALUES (2, 5, 1, '01-12-2012', 'FSA', 'GSM')
INSERT INTO #Test VALUES (3, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (3, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (4, 1, 1, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (4, 2, 0, '01-02-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (4, 3, 0, '01-03-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (5, 1, 0, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (5, 2, 1, '01-02-2012', 'LSI', 'LSI')
INSERT INTO #Test VALUES (5, 3, 0, '01-03-2012', NULL, 'LSI')
INSERT INTO #Test VALUES (6, 1, 1, '01-01-2012', NULL, NULL)
INSERT INTO #Test VALUES (6, 2, 0, '01-02-2012', 'LSI', 'LSI')
INSERT INTO #Test VALUES (6, 3, 1, '01-03-2012', NULL, 'LSI')
INSERT INTO #Test VALUES (7, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (7, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (7, 3, 1, '01-03-2012', 'RTA', 'RTA')
INSERT INTO #Test VALUES (8, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (8, 2, 0, '01-02-2012', NULL, 'FSA')
INSERT INTO #Test VALUES (8, 3, 1, '01-03-2012', NULL, NULL)
INSERT INTO #Test VALUES (9, 1, 1, '01-01-2012', 'FSA', 'FSA')
INSERT INTO #Test VALUES (9, 2, 1, '01-02-2012', NULL, NULL)
INSERT INTO #Test VALUES (9, 3, 1, '01-03-2012', 'RTS', 'RTS')
INSERT INTO #Test VALUES (10, 1, 1, '01-01-2012', 'FSA','FSA')
INSERT INTO #Test VALUES (10, 2, 1, '01-02-2012', 'GSM','GSM')
INSERT INTO #Test VALUES (10, 3, 1, '01-03-2012', 'RTS','RTS')
INSERT INTO #Test VALUES (11, 1, 0, '01-01-2012', 'NOP','NOP')
INSERT INTO #Test VALUES (11, 2, 1, '01-02-2012', 'TAP','NOP')
INSERT INTO #Test VALUES (11, 3, 1, '01-03-2012', 'STG','NOP')
INSERT INTO #Test VALUES (12, 1, 1, '01-01-2012', 'RTS','RTS')
INSERT INTO #Test VALUES (12, 2, 0, '01-02-2012', 'RTM','RTM')
INSERT INTO #Test VALUES (12, 3, 1, '01-03-2012', 'LSA','RTM')
INSERT INTO #Test VALUES (12, 4, 1, '01-03-2012', 'LSA','RTM')
INSERT INTO #Test VALUES (12, 5, 1, '01-03-2012', 'GSM','RTM')
INSERT INTO #Test VALUES (13, 1, 1, '01-08-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (13, 2, 0, '01-09-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (13, 3, 1, '01-10-2012', 'TST','TST')
INSERT INTO #Test VALUES (14, 1, 1, '01-08-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (14, 2, 0, '01-09-2012', 'GIP','GIP')
INSERT INTO #Test VALUES (14, 3, 1, '01-10-2012', 'TST','GIP')
INSERT INTO #Test VALUES (15, 1, 1, '01-01-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 2, 0, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 3, 1, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (15, 4, 1, '01-02-2012', 'GYM','BAR')
INSERT INTO #Test VALUES (16, 1, 1, '01-02-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (16, 2, 0, '01-03-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (16, 3, 1, '01-03-2012', 'BAR','BAR')
INSERT INTO #Test VALUES (16, 4, 1, '01-03-2012', 'GYM','GYM')
INSERT INTO #Test VALUES (17, 1, 1, '01-02-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (17, 2, 0, '01-03-2012', 'GIP', 'GIP')
INSERT INTO #Test VALUES (17, 3, 0, '01-03-2012', NULL, 'GIP')
INSERT INTO #Test VALUES (17, 4, 1, '01-03-2012', 'TST', 'GIP')
-- -------------------------------------------
-- Following is the GID=18 test case that fails
-- -------------------------------------------
INSERT INTO #Test VALUES (18, 1, 1, '01-02-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (18, 2, 0, '01-03-2012', 'BAR', 'BAR')
INSERT INTO #Test VALUES (18, 3, 0, '01-03-2012', NULL, 'BAR')
INSERT INTO #Test VALUES (18, 4, 1, '01-03-2012', 'TST', 'BAR')
CHECKPOINT
-- Clone the base rows once per multiplier value (1 to 100000), shifting GID
-- by 20 per copy so the copies collide neither with each other nor with the
-- base groups (highest base GID is 18).
-- The triple cross join of master.sys.objects only serves as a row source
-- for ROW_NUMBER().
INSERT INTO #Test (GID, Seq, IsLive, Eff, Name, Expected)
SELECT tmp.GID + (multiplier.Num * 20) AS [GID], tmp.Seq, tmp.IsLive, tmp.Eff, tmp.Name, tmp.Expected
FROM #Test tmp
CROSS JOIN (
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS [Num]
FROM master.sys.objects so1
CROSS JOIN master.sys.objects so2
CROSS JOIN master.sys.objects so3
) multiplier
WHERE multiplier.Num <= 100000
CHECKPOINT
-- Report the final row count, then rebuild the indexes on #Test after the bulk load.
SELECT COUNT(*) FROM #Test
ALTER INDEX ALL ON #Test REBUILD
-- SELECT TOP 1000 * FROM #Test ORDER BY GID, Seq
END /* IF (OBJECT_ID('tempdb.dbo.#Test') IS NULL) */
-----------------------------------------------------------------------------
-- Timed run 1: the original recursive-CTE query.
-- Clear the plan cache and the clean buffer pages first so this run does not
-- benefit from earlier executions.
DBCC FREEPROCCACHE WITH NO_INFOMSGS
DBCC DROPCLEANBUFFERS WITH NO_INFOMSGS
PRINT '-- Original solution (Recursive CTE):'
PRINT ''
SET STATISTICS IO ON
SET STATISTICS TIME ON
-- Anchor: the first row (Seq = 1) of every group.
-- Recursive step: walks to Seq + 1 within the same group, carrying the
-- trackers for the last non-live / live Seq and the Name chosen so far.
;WITH CTE AS (
SELECT T.GID, T.SEQ, T.IsLive, Expected
, Name AS Name
, CASE WHEN T.IsLive = 0 THEN T.SEQ ELSE NULL END As PrevNonLiveSeq
, CASE WHEN T.IsLive = 1 THEN T.SEQ ELSE NULL END As PrevLiveSeq
, NULL AS PerNonLiveSeqCalc
, NULL AS PerLiveSeqCalc
, 0 PrevSeq
, CAST(NULL AS varchar(50)) PrevName
FROM #Test T
WHERE T.Seq = 1
UNION ALL
SELECT Curr.GID, Curr.SEQ, Curr.IsLive, Curr.Expected
,CASE WHEN Curr.IsLive = 0 THEN ISNULL(Curr.Name, Prev.Name)
ELSE CASE WHEN PrevNonLive.Name IS NULL THEN
CASE WHEN Prev.Name <> PrevLive.Name THEN Prev.Name ELSE Curr.Name END
ELSE Prev.Name END
END
,CASE WHEN Curr.IsLive = 0 THEN Curr.SEQ ELSE Prev.PrevNonLiveSeq END As PrevNonLiveSeq
,CASE WHEN Curr.IsLive = 1 THEN Curr.SEQ ELSE Prev.PrevLiveSeq END As PrevLiveSeq
, ISNULL(Prev.PrevNonLiveSeq, Curr.SEQ) AS PerNonLiveSeqCalc
, ISNULL(Prev.PrevLiveSeq, Curr.SEQ) AS PerLiveSeqCalc
, Prev.Seq PrevSeq, Prev.Name PrevName
FROM CTE Prev
JOIN #Test Curr ON Curr.GID = Prev.GID AND Curr.SEQ = Prev.SEQ+1
JOIN #Test PrevNonLive ON Prev.GID = PrevNonLive.GID AND PrevNonLive.SEQ = ISNULL(Prev.PrevNonLiveSeq, Curr.SEQ)
JOIN #Test PrevLive ON Prev.GID = PrevLive.GID AND PrevLive.SEQ = ISNULL(Prev.PrevLiveSeq, Curr.SEQ)
)
-- Compare the cascaded Name with the Expected column, row by row.
SELECT CTE.GID, CTE.Seq, T.IsLive
, T.Name Input, CTE.Name [Output]
, CASE WHEN CTE.Name = CTE.Expected OR (CTE.Name IS NULL AND CTE.Expected IS NULL) THEN 'Pass' ELSE 'FAIL' END AS Result
, CTE.Expected
FROM CTE
INNER JOIN #Test T on CTE.GID = T.GID AND CTE.Seq = T.Seq
ORDER BY CTE.GID, CTE.Seq
SET STATISTICS TIME OFF
SET STATISTICS IO OFF
PRINT '=================================================='
------------------------------------------------------
-- Timed run 2: the proposed OUTER APPLY query.
-- The caches are cleared again so both runs start from the same state.
DBCC FREEPROCCACHE WITH NO_INFOMSGS
DBCC DROPCLEANBUFFERS WITH NO_INFOMSGS
PRINT '-- Proposed solution (OUTER APPLY):'
PRINT ''
SET STATISTICS IO ON
SET STATISTICS TIME ON
-- Output is the cascaded Name when an earlier row qualifies, else the row's
-- own Name. The '~~~' sentinel makes two NULLs compare as equal in the
-- Pass/FAIL check.
SELECT crrnt.GID, crrnt.Seq, crrnt.IsLive,
COALESCE(cscd.Name, crrnt.Name) AS [Output],
CASE
WHEN COALESCE(COALESCE(cscd.Name, crrnt.Name), '~~~') = COALESCE(crrnt.Expected, '~~~') THEN 'Pass'
ELSE 'FAIL'
END AS [Result],
crrnt.Expected
FROM #Test crrnt
OUTER APPLY (
-- Closest earlier row of the same GID whose Name should flow down.
SELECT TOP 1 *
FROM #Test prir
WHERE prir.GID = crrnt.GID
AND prir.Seq < crrnt.Seq
AND (
(
-- Live current row: take the Name of an earlier non-live row that has one.
crrnt.IsLive = 1
AND prir.IsLive = 0
AND prir.Name IS NOT NULL
)
OR (
-- Non-live current row without its own Name.
crrnt.IsLive = 0
AND crrnt.Name IS NULL
AND (
(
prir.IsLive = 0
AND prir.Name IS NOT NULL
)
OR (
-- An earlier live row qualifies only when no named non-live row precedes it.
prir.IsLive = 1
AND NOT EXISTS(
SELECT *
FROM #Test confirm
WHERE confirm.GID = prir.GID
AND confirm.Seq < prir.Seq
AND confirm.IsLive = 0
AND confirm.Name IS NOT NULL
)
)
)
)
)
ORDER BY prir.Seq DESC
) cscd
SET STATISTICS TIME OFF
SET STATISTICS IO OFF
-----------------------------------
My execution of the above test shows:
Original Query: CPU time = 173031 ms, elapsed time = 252708 ms, logical reads = 97,538,739
Proposed Query = CPU time = 49125 ms, elapsed time = 74003 ms, logical reads = 17,747,775
Hence, the original query is about 3.5 times slower for both CPU and elapsed time, and about 5 times more logical reads than my proposed query. Be careful with Recursive CTEs ;-).