SQL: Get last referring and post referring page during a signup process - sql

I'm trying to write an efficient SQL query to select 'before' and 'after' pages for the signup process. I have a solution using for loops which doesn't scale and am hoping to get a SQL native solution.
For a single clientId, I would want to get the latest pages before sign up and after signup (only 1 from each side of the join process).
The join process ALWAYS has /join/complete
Input:
clientId time path
1 0 /page1
1 10 /page2
1 20 /join/<random_token_id>
1 30 /join/<random_token_id>/step2
1 40 /join/complete
1 50 /page2
2 0 /page3
2 10 /join/complete
Output
ClientId Before After
1 /page2 /page2
2 /page3 null
I would be grateful if there is an easy solution in SQL. If it's complex, just leave it out. I will leave the code running overnight.

#standardSQL
WITH lineup AS (
SELECT clientId, time, path,
ROW_NUMBER() OVER(PARTITION BY clientId ORDER BY time) pos
FROM `project.dataset.table`
), start AS (
SELECT row.clientId, row.pos FROM (
SELECT ARRAY_AGG(t ORDER BY pos LIMIT 1)[OFFSET(0)] row
FROM lineup t WHERE STARTS_WITH(path, '/join/')
GROUP BY clientId)
), complete AS (
SELECT clientId, pos FROM lineup WHERE path = '/join/complete'
), before AS (
SELECT lineup.clientId, path FROM lineup JOIN start
ON lineup.clientId = start.clientId AND lineup.pos = start.pos - 1
), after AS (
SELECT lineup.clientId, path FROM lineup JOIN complete
ON lineup.clientId = complete.clientId AND lineup.pos = complete.pos + 1
)
SELECT clientId, before.path AS before, after.path AS after
FROM before FULL OUTER JOIN after USING (clientId)
You can test / play with above using dummy data from your question as below
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 clientId, 0 time, '/page1' path UNION ALL
SELECT 1, 10, '/page2' UNION ALL
SELECT 1, 20, '/join/<random_token_id>' UNION ALL
SELECT 1, 30, '/join/<random_token_id>/step2' UNION ALL
SELECT 1, 40, '/join/complete' UNION ALL
SELECT 1, 50, '/page2' UNION ALL
SELECT 2, 0, '/page3' UNION ALL
SELECT 2, 10, '/join/complete' UNION ALL
SELECT 3, 0, '/join/complete' UNION ALL
SELECT 3, 10, '/page4'
), lineup AS (
SELECT clientId, time, path,
ROW_NUMBER() OVER(PARTITION BY clientId ORDER BY time) pos
FROM `project.dataset.table`
), start AS (
SELECT row.clientId, row.pos FROM (
SELECT ARRAY_AGG(t ORDER BY pos LIMIT 1)[OFFSET(0)] row
FROM lineup t WHERE STARTS_WITH(path, '/join/')
GROUP BY clientId)
), complete AS (
SELECT clientId, pos FROM lineup WHERE path = '/join/complete'
), before AS (
SELECT lineup.clientId, path FROM lineup JOIN start
ON lineup.clientId = start.clientId AND lineup.pos = start.pos - 1
), after AS (
SELECT lineup.clientId, path FROM lineup JOIN complete
ON lineup.clientId = complete.clientId AND lineup.pos = complete.pos + 1
)
SELECT clientId, before.path AS before, after.path AS after
FROM before FULL OUTER JOIN after USING (clientId)
with result as
Row clientId before after
1 1 /page2 /page2
2 2 /page3 null
3 3 null /page4

Related

how to delete repeated values in UNION query using count

Hello I have been trying to delete a repeated value on the following UNION query with the following results (image). How can I filter out the value LW_ID=8232 with AANTALLN =0. I need to find a way taht if in the first query AANTALLN >0 is found, then on the second part of the union query not insert it again. Thanks "
With LESEENHEIDLOOPBAAN as (
SELECT
LE_AGENDA_FK,
LE_CODE,
LE_ID,
LE_KLAS_FK,
LE_KLASPARTITIE_FK,
LE_OMSCHRIJVING,
LE_VERANDERDDOOR,
LE_VERANDERDOP,
Count(LH_ID) As AantalLln
FROM
LESEENHEID
INNER JOIN LOOPBAANLESEENHEID on (LH_LESEENHEID_FK = LE_ID)
INNER JOIN LOOPBAAN ON (LH_LOOPBAAN_FK = LB_ID)
WHERE
(
'2022/09/28' BETWEEN LB_VAN
AND LB_TOT
)
AND (
LE_ID in (8277, 8276, 8232)
)
GROUP BY
LE_AGENDA_FK,
LE_CODE,
LE_ID,
LE_KLAS_FK,
LE_KLASPARTITIE_FK,
LE_OMSCHRIJVING,
LE_VERANDERDDOOR,
LE_VERANDERDOP
),
LESEENHEIDLOOPBAANNULL AS (
SELECT
LE_AGENDA_FK,
LE_CODE,
LE_ID,
LE_KLAS_FK,
LE_KLASPARTITIE_FK,
LE_OMSCHRIJVING,
LE_VERANDERDDOOR,
LE_VERANDERDOP,
0 As AantalLln
FROM
LESEENHEID
where
LE_ID in (8277, 8276, 8232)
and EXISTS (
SELECT
*
FROM
LESEENHEIDLOOPBAAN
)
)
SELECT
*
FROM
LESEENHEIDLOOPBAAN
UNION
SELECT
*
FROM
LESEENHEIDLOOPBAANNULL ROWS 1000
Try this out using ROW_NUMBER:
SELECT * FROM (
SELECT
ROW_NUMBER () OVER (PARTITION BY LW_ID ORDER BY AANTALLN DESC) AS RN
,* FROM
(
SELECT * FROM
LESEENHEIDLOOPBAAN
UNION
SELECT
*
FROM
LESEENHEIDLOOPBAANNULL ROWS 1000
)
)
) WHERE RN = 1
This way you eliminate the duplicates.

Where clause on Running total

I have this table which stores containers by region and the number of coffee pouches in each of the containers.
if object_id( 'dbo.Container' ) is not null
drop table dbo.Container
go
create table dbo.Container
(
Id int not null,
Region int not null,
NumberOfCoffeePouches int not null,
constraint pkc_Container__Id primary key clustered(Id asc)
)
go
insert into dbo.Container
( Id , Region , NumberOfCoffeePouches )
values
( 1, 1, 10 ),
( 2, 1, 30 ),
( 3, 1, 5),
( 4, 1, 7),
( 5, 1, 1),
( 6, 1, 3),
( 7, 2, 4),
( 8, 2, 4),
( 9, 2, 4)
I need to list out the container Ids that will be used to fulfill an order of, say 50, coffee pouches. Over supplying is OK.
Here is query I have come up with
declare #RequiredCoffeePouches int = 50
select
sq2.Id,
sq2.NumberOfCoffeePouches,
sq2.RunningTotal,
sq2.LagRunningTotal
from
(
select
sq1.Id,
sq1.NumberOfCoffeePouches,
sq1.RunningTotal,
lag(sq1.RunningTotal, 1, 0) over (order by sq1.Id asc)
as 'LagRunningTotal'
from
(
select
c.Id,
c.NumberOfCoffeePouches,
sum(c.NumberOfCoffeePouches)
over (order by c.Id asc) as 'RunningTotal'
from
dbo.Container as c
where
c.Region = 1
) as sq1
) as sq2
where
sq2.LagRunningTotal <= #RequiredCoffeePouches
It gives the expected result
Id NumberOfCoffeePouches RunningTotal LagRunningTotal
----------- --------------------- ------------ ---------------
1 10 10 0
2 30 40 10
3 5 45 40
4 7 52 45
Question:
Is there a better and more optimized way to achieve this?
Specially the Container table is very large table and I think the sub query sq1 will unnecessarily calculate the RunningTotals for all the containers in the region. I was wondering if there is anyway to have sq1 stop processing more rows once the RunnningTotal exceeds over the #RequiredCoffeePouches.
Two things:
Moving your WHERE clause inside of the relevant sub-select can greatly increase the speed of the query because it'll pull less data. Using your example:
SELECT
sq2.Id,
sq2.NumberOfCoffeePouches,
sq2.RunningTotal,
sq2.LagRunningTotal
FROM
(
SELECT
sq1.Id,
sq1.NumberOfCoffeePouches,
sq1.RunningTotal,
lag(sq1.RunningTotal, 1, 0) over (order by sq1.Id asc) AS 'LagRunningTotal'
FROM
(
SELECT
c.Id,
c.NumberOfCoffeePouches,
SUM(c.NumberOfCoffeePouches) OVER (order by c.Id asc) AS 'RunningTotal'
FROM dbo.Container AS c
WHERE c.Region = 1
) AS sq1
WHERE sq2.LagRunningTotal <= #RequiredCoffeePouches
) AS sq2
CTEs can also improve performance:
;WITH sql1CTE AS (
SELECT
c.Id,
c.NumberOfCoffeePouches,
SUM(c.NumberOfCoffeePouches) OVER (order by c.Id asc) AS 'RunningTotal'
FROM dbo.Container AS c
WHERE c.Region = 1
),
sql2CTE AS (
SELECT
Id,
NumberOfCoffeePouches,
RunningTotal,
lag(RunningTotal, 1, 0) over (order by Id asc) AS 'LagRunningTotal'
FROM sql1CTE
WHERE LagRunningTotal <= #RequiredCoffeePouches
)
SELECT
Id,
NumberOfCoffeePouches,
RunningTotal,
LagRunningTotal
FROM sql2CTE
SQL Server CTE Basics
If you're using SSMS, select "Include Client Statistics" and "Include Actual Execution Plan" to keep track of how your query performs while you're crafting it.

SQL reporting query

I have a database with following structure.
CREATE TABLE Party
(
PartyID INT IDENTITY
PRIMARY KEY ,
StatusID INT ,
Weigth INT ,
OldWeigth INT
);
GO
CREATE TABLE PartyLocation
(
PartyLocationID INT IDENTITY
PRIMARY KEY ,
PartyID INT FOREIGN KEY REFERENCES dbo.Party ( PartyID ) ,
LocationID INT ,
Distance INT
);
GO
CREATE TABLE PartyRole
(
PartyRoleID INT IDENTITY
PRIMARY KEY ,
PartyID INT FOREIGN KEY REFERENCES dbo.Party ( PartyID ) ,
RoleID INT
);
with some simple data.
INSERT INTO dbo.Party
( StatusID, Weigth, OldWeigth )
VALUES ( 1, -- StatusID - int
10, -- Age - int
20 -- OldAge - int
),
( 1, 15, 25 ),
( 2, 20, 30 );
INSERT INTO dbo.PartyLocation
( PartyID, LocationID, Distance )
VALUES ( 1, -- PartyID - int
1, -- LocationID - int
100 -- Distance - int
),
( 1, 2, 200 ),
( 1, 3, 300 ),
( 2, 1, 1000 ),
( 2, 2, 2000 ),
( 3, 1, 10000 );
INSERT INTO dbo.PartyRole
( PartyID, RoleID )
VALUES ( 1, -- PartyID - int
1 -- RoleID - int
),
( 1, 2 ),
( 1, 3 ),
( 2, 1 ),
( 2, 2 ),
( 3, 1 );
I want to query the following information
Return sum of Weigth of all parties that has roleID = 1 in PartyRole table
Return sum of OldWeigth of all parties that has statusID = 2
Return sum of distances of all parties that has locationID = 3
Return sum of distances of all parties that has roleID = 2
So the expected results are
FilteredWeigth FilteredOldWeigth FilteredDistance AnotherFilteredDistance
-------------- ----------------- ---------------- -----------------------
45 30 600 3600
Can we write a query that will query each table just once? If no what will be the most optimal way to query the data?
You can try this.
SELECT
FilteredWeigth = SUM(CASE WHEN RoleID = 1 AND RN_P = 1 THEN Weigth END) ,
FilteredOldWeigth = SUM(CASE WHEN StatusID = 2 AND RN_P = 1 THEN OldWeigth END),
FilteredDistance = SUM(CASE WHEN LocationID = 3 AND RN_L = 1 THEN Distance END),
AnotherFilteredDistance = SUM(CASE WHEN RoleID = 2 THEN Distance END)
FROM (
SELECT P.Weigth, P.StatusID, P.OldWeigth, PL.LocationID, PL.Distance, PR.RoleID,
RN_P = ROW_NUMBER() OVER (PARTITION BY P.PartyID ORDER BY PL.PartyLocationID),
RN_L = ROW_NUMBER() OVER (PARTITION BY PL.LocationID ORDER BY PR.PartyRoleID)
FROM Party P
INNER JOIN PartyLocation PL ON P.PartyID = PL.PartyID
INNER JOIN PartyRole PR ON P.PartyID = PR.PartyID
) AS T
the below gives
45 20 300 3600
the third column gives 300 which does not correspond to your expected result.
with q1
as
(
select sum(weigth) FilteredWeigth
from party join partyrole on party.partyid = partyrole.partyid
where partyrole.RoleID = '1'
),
q2 as
(
select sum(weigth) OldWeigth from party where StatusID = '2'
),
q3 as (
select sum(Distance) FilteredDistance
from party join PartyLocation on party.partyid = PartyLocation.partyid
where PartyLocation.locationID = '3'
),
q4 as
(
select sum(Distance) AnotherFilteredDistance
from party join partyrole on party.partyid = partyrole.partyid
join PartyLocation on party.partyid = PartyLocation.partyid
where partyrole.RoleID = '2'
)
select FilteredWeigth,OldWeigth,FilteredDistance,AnotherFilteredDistance
from q1,q2,q3,q4
When Using Individual Queries, you can achieve this using the following
Return sum of Weight of all parties that has roleID = 1 in PartyRole table
SELECT
SUM(Weight) FilteredWeigth
FROM dbo.Party P
WHERE EXISTS
(
SELECT
1
FROM dbo.PartyRole PR
WHERE PR. PartyID = P.PartyID
AND PR.RoleId = 1
)
Return sum of OldWeigth of all parties that has statusID = 2
SELECT
SUM(OldWeigth) FilteredOldWeigth
FROM dbo.Party P
WHERE EXISTS
(
SELECT
1
FROM dbo.PartyRole PR
WHERE PR. PartyID = P.PartyID
AND PR.RoleId = 2
)
Return sum of distances of all parties that has locationID = 3
SELECT
SUM(Distance) FilteredDistance
FROM dbo.PartyLocation
WHERE LocationID = 3
Return sum of distances of all parties that has roleID = 2
SELECT SUM(Distance) FROM PartyLocation PL
WHERE EXISTS
(
SELECT 1 FROM PartyRole PR
WHERE PR.PartyID = PL.PartyID
AND PR.Roleid = 2
)
If you want to get the result of all these in a single result set. then maybe you can try a pivot query. Like this
WITH CTE
AS
(
SELECT
'FilteredWeigth' ColNm,
SUM(Weigth) Val
FROM dbo.Party P
WHERE EXISTS
(
SELECT
1
FROM dbo.PartyRole PR
WHERE PR. PartyID = P.PartyID
AND PR.RoleId = 1
)
UNION
SELECT
'FilteredOldWeigth' ColNm,
SUM(OldWeigth) Val
FROM dbo.Party P
WHERE EXISTS
(
SELECT
1
FROM dbo.PartyRole PR
WHERE PR. PartyID = P.PartyID
AND PR.RoleId = 2
)
UNION
SELECT
'FilteredDistance' ColNm,
SUM(Distance) Val
FROM dbo.PartyLocation
WHERE LocationID = 3
UNION
SELECT
'AnotherFilteredDistance' ColNm,
SUM(Distance) Val FROM PartyLocation PL
WHERE EXISTS
(
SELECT 1 FROM PartyRole PR
WHERE PR.PartyID = PL.PartyID
AND PR.Roleid = 2
)
)
SELECT
*
FROM CTE
PIVOT
(
SUM(Val)
FOR ColNm IN
(
[FilteredWeigth],[FilteredOldWeigth],[FilteredDistance],[AnotherFilteredDistance]
)
)Pvt
The Result Will be
I could think of only three possible options:
Union query with four different select statements as answered by #ab-bennett
Join all tables then use select statements as answered by sarslan
Mix of 1 and 2, based on experiments
Coming to the question you asked:
Can we write a query that will query each table just once?
Assuming best performance is the goal, following could happen in each of the above cases:
All select statements would have their own where clause. This would perform best when where produces few rows compared to the count(*). Note that Joins are terrible for very large tables.
A join is made once, and the desired output is obtained from the same Joined table. This would perform optimal when where produces significant number of rows and the table is not too big to join.
You can mix JOIN / IN / EXISTS / WHERE to optimize your queries based on number of rows you are having in table. This approach could be used when your dataset cardinality might not vary a lot.

Remove + - value records in SQL where clause

I need to remove the + - values records mean to say
I need only Blue colored two records from the output windows.
Hope its clear what exactly I want.
User5 | -15
User6 | -10
The idea is to get rows whose second column, in my case it's Val, is are cancelled out. You can do it by getting the absolute value and assign a row number grouped by absolute value and the value itself. Those row number that does not have a match should be the result.
WITH SampleData(UserID, Val) AS(
SELECT 'User1', -10 UNION ALL
SELECT 'User2', 10 UNION ALL
SELECT 'User3', -15 UNION ALL
SELECT 'User4', -10 UNION ALL
SELECT 'User5', -15 UNION ALL
SELECT 'User6', -10 UNION ALL
SELECT 'User7', 10 UNION ALL
SELECT 'User8', 15
)
,Numbered AS(
SELECT
UserID,
Val,
BaseVal = ABS(Val),
RN = ROW_NUMBER() OVER(PARTITION BY ABS(Val), Val ORDER BY UserId)
FROM SampleData
)
SELECT
n1.UserID,
n1.Val
FROM Numbered n1
LEFT JOIN Numbered n2
ON n2.BaseVal = n1.BaseVal
AND n2.RN = n1.rn
AND n2.UserID <> n1.UserID
WHERE n2.UserID IS NULL
ORDER BY n1.UserID
Appears that you want rows where the total does not equal 0?
select
userName,
userValue
from
yourTable
where
userName in (
select userName from yourTable
group by userName
having sum (userValue) <> 0
)

Find overlapping sets of data in a table

I need to identify duplicate sets of data and give those sets who's data is similar a group id.
id threshold cost
-- ---------- ----------
1 0 9
1 100 7
1 500 6
2 0 9
2 100 7
2 500 6
I have thousands of these sets, most are the same with different id's. I need find all the like sets that have the same thresholds and cost amounts and give them a group id. I'm just not sure where to begin. Is the best way to iterate and insert each set into a table and then each iterate through each set in the table to find what already exists?
This is one of those cases where you can try to do something with relational operators. Or, you can just say: "let's put all the information in a string and use that as the group id". SQL Server seems to discourage this approach, but it is possible. So, let's characterize the groups using:
select d.id,
(select cast(threshold as varchar(8000)) + '-' + cast(cost as varchar(8000)) + ';'
from data d2
where d2.id = d.id
for xml path ('')
order by threshold
) as groupname
from data d
group by d.id;
Oh, I think that solves your problem. The groupname can serve as the group id. If you want a numeric id (which is probably a good idea, use dense_rank():
select d.id, dense_rank() over (order by groupname) as groupid
from (select d.id,
(select cast(threshold as varchar(8000)) + '-' + cast(cost as varchar(8000)) + ';'
from data d2
where d2.id = d.id
for xml path ('')
order by threshold
) as groupname
from data d
group by d.id
) d;
Here's the solution to my interpretation of the question:
IF OBJECT_ID('tempdb..#tempGrouping') IS NOT NULL DROP Table #tempGrouping;
;
WITH BaseTable AS
(
SELECT 1 id, 0 as threshold, 9 as cost
UNION SELECT 1, 100, 7
UNION SELECT 1, 500, 6
UNION SELECT 2, 0, 9
UNION SELECT 2, 100, 7
UNION SELECT 2, 500, 6
UNION SELECT 3, 1, 9
UNION SELECT 3, 100, 7
UNION SELECT 3, 500, 6
)
, BaseCTE AS
(
SELECT
id
--,dense_rank() over (order by threshold, cost ) as GroupId
,
(
SELECT CAST(TblGrouping.threshold AS varchar(8000)) + '/' + CAST(TblGrouping.cost AS varchar(8000)) + ';'
FROM BaseTable AS TblGrouping
WHERE TblGrouping.id = BaseTable.id
ORDER BY TblGrouping.threshold, TblGrouping.cost
FOR XML PATH ('')
) AS MultiGroup
FROM BaseTable
GROUP BY id
)
,
CTE AS
(
SELECT
*
,DENSE_RANK() OVER (ORDER BY MultiGroup) AS GroupId
FROM BaseCTE
)
SELECT *
INTO #tempGrouping
FROM CTE
-- SELECT * FROM #tempGrouping;
UPDATE BaseTable
SET BaseTable.GroupId = #tempGrouping.GroupId
FROM BaseTable
INNER JOIN #tempGrouping
ON BaseTable.Id = #tempGrouping.Id
IF OBJECT_ID('tempdb..#tempGrouping') IS NOT NULL DROP Table #tempGrouping;
Where BaseTable is your table, and and you don't need the CTE "BaseTable", because you have a data table.
You may need to take extra-precautions if your threshold and cost fields can be NULL.