Update value based on value from another record of same table - sql

Here I have a sample table of website visitors. As we can see, sometimes visitors don't provide their email. They may also switch to different email addresses over time.
**
Original table:
**
I want to update this table with following requirements:
First time when a visitor provides an email, all his past visits will be tagged to that email
Also, all his future visits will be tagged to that email until he switches to another email.
**
Expected table after update:
**
I was wondering if there is a way of doing it in Redshift or T-Sql?
Thanks everyone!

In SQL Server or Redshift, you can use a subquery to calculate the email:
-- Compute an imputed email for every visit:
--   1. keep the row's own email when it has one;
--   2. otherwise take the email of the rows sharing the same grp value
--      (grp is the running count of non-null emails for the visitor);
--   3. otherwise fall back to the email on the visitor's first visit that has one.
select t.*,
       coalesce(email,
                max(email) over (partition by visitor_id, grp),
                max(case when activity_date = first_email_date then email end) over (partition by visitor_id)
               ) as imputed_email
from (select t.*,
             -- earliest activity_date with a non-null email, seen up to the current row
             min(case when email is not null then activity_date end) over
                 (partition by visitor_id order by activity_date rows between unbounded preceding and current row) as first_email_date,
             -- running count of non-null emails; the frame needs the ROWS keyword to parse
             count(email) over
                 (partition by visitor_id order by activity_date rows between unbounded preceding and current row) as grp
      from t
     ) t;
You can then use this in an update:
-- Write the imputed email back onto rows whose email is missing.
-- Rows are matched on (visitor_id, activity_date); rows that already have an email are left alone.
update t
set email = tt.imputed_email
from (select t.*,
             coalesce(email,
                      max(email) over (partition by visitor_id, grp),
                      max(case when activity_date = first_email_date then email end) over (partition by visitor_id)
                     ) as imputed_email
      from (select t.*,
                   -- explicit frames, matching the stand-alone select; Redshift requires a
                   -- frame clause on aggregate window functions that use ORDER BY
                   min(case when email is not null then activity_date end) over
                       (partition by visitor_id order by activity_date rows between unbounded preceding and current row) as first_email_date,
                   count(email) over
                       (partition by visitor_id order by activity_date rows between unbounded preceding and current row) as grp
            from t
           ) t
     ) tt
where tt.visitor_id = t.visitor_id
  and tt.activity_date = t.activity_date
  and t.email is null;

If we suppose that the table is named Visits and that its primary key consists of the columns Visitor_id and Activity_Date, then you can do the following in T-SQL:
using correlated subquery:
-- Fill each missing email with the visitor's most recent earlier email,
-- or, when no earlier email exists, with the visitor's earliest later email.
UPDATE cur
SET cur.Email = COALESCE(
        -- latest email strictly before this visit
        (SELECT TOP 1 prev.Email
         FROM Visits AS prev
         WHERE prev.Visitor_id = cur.Visitor_id
           AND prev.Activity_Date < cur.Activity_Date
           AND prev.Email IS NOT NULL
         ORDER BY prev.Activity_Date DESC),
        -- earliest email strictly after this visit
        (SELECT TOP 1 nxt.Email
         FROM Visits AS nxt
         WHERE nxt.Visitor_id = cur.Visitor_id
           AND nxt.Activity_Date > cur.Activity_Date
           AND nxt.Email IS NOT NULL
         ORDER BY nxt.Activity_Date)
    )
FROM Visits AS cur
WHERE cur.Email IS NULL;
using window function to provide the ordering:
-- Same fill rule, expressed as a join: every email-less visit is paired with its
-- candidate source rows, and ROW_NUMBER() keeps the best candidate per visit.
UPDATE tgt
SET Email = pick.Email
FROM Visits AS tgt
JOIN (
    SELECT
        cur.Visitor_id,
        COALESCE(prev.Email, nxt.Email) AS Email,
        cur.Activity_Date,
        -- latest previous email first; among "next" candidates, the earliest first
        ROW_NUMBER() OVER (
            PARTITION BY cur.Visitor_id, cur.Activity_Date
            ORDER BY prev.Activity_Date DESC, nxt.Activity_Date
        ) AS Row_num
    FROM Visits AS cur
    -- earlier visits that carry an email
    LEFT JOIN Visits AS prev
        ON prev.Visitor_id = cur.Visitor_id
       AND prev.Activity_Date < cur.Activity_Date
       AND prev.Email IS NOT NULL
    -- later visits that carry an email, joined only when no earlier one matched
    LEFT JOIN Visits AS nxt
        ON nxt.Visitor_id = cur.Visitor_id
       AND nxt.Activity_Date > cur.Activity_Date
       AND nxt.Email IS NOT NULL
       AND prev.Visitor_id IS NULL
    WHERE cur.Email IS NULL
) AS pick
    ON pick.Visitor_id = tgt.Visitor_id
   AND pick.Activity_Date = tgt.Activity_Date
WHERE pick.Row_num = 1;

For each visitor_id you can update the null email value with the previous non-null value. In case there is none, you will use the next non-null value. You can get those values as follows:
-- For every visit without an email, show the closest earlier and the closest later
-- email recorded for the same visitor (either may be NULL).
SELECT
    cur.*,
    before_row.email AS prev_email,
    after_row.email  AS next_email
FROM visits AS cur
LEFT JOIN visits AS before_row
    ON before_row.visitor_id = cur.visitor_id
   AND before_row.activity_date = (
           -- latest earlier visit that has an email
           SELECT MAX(x.activity_date)
           FROM visits AS x
           WHERE x.visitor_id = cur.visitor_id
             AND x.activity_date < cur.activity_date
             AND x.email IS NOT NULL
       )
LEFT JOIN visits AS after_row
    ON after_row.visitor_id = cur.visitor_id
   AND after_row.activity_date = (
           -- earliest later visit that has an email
           SELECT MIN(x.activity_date)
           FROM visits AS x
           WHERE x.visitor_id = cur.visitor_id
             AND x.activity_date > cur.activity_date
             AND x.email IS NOT NULL
       )
WHERE cur.email IS NULL

Related

PostgresSQL/SQL Query

I want to get the activity_id of the row of every first "email" activity that happened in between the "completed_order" activity in a column "first_in_between"
I wrote this query
-- For a 'completed_order' row whose immediately following row (per customer, by ts)
-- is an 'email', return that following row's activity_id; otherwise NULL.
SELECT
    activity_id,
    customer,
    activity,
    ts,
    CASE
        WHEN activity = 'completed_order'
             AND LEAD(activity) OVER (PARTITION BY customer ORDER BY ts) = 'email'
            THEN LEAD(activity_id) OVER (PARTITION BY customer ORDER BY ts)
    END AS First_in_between
FROM activity_stream
WHERE customer = 'Lehmanns Marktstand'
ORDER BY ts
With the above Query, I am getting this result.
My Desired Results should be
You can readily get the timestamp of the email using:
-- For each 'completed_order' row, return the ts of the first later 'email', provided that
-- email occurs before the next 'completed_order'.
-- The windows are ordered by ts DESC, so "preceding" rows are the later ones.
select activity_id, customer, activity, ts,
       (case when activity = 'completed_order' and
                  (min(ts) filter (where activity = 'email')
                       over (partition by customer order by ts desc) <
                   -- exclude the current row: with the default frame the current
                   -- 'completed_order' row is itself the minimum, so the comparison
                   -- could never be true
                   min(ts) filter (where activity = 'completed_order')
                       over (partition by customer order by ts desc
                             rows between unbounded preceding and 1 preceding)
                  )
             then min(ts) filter (where activity = 'email')
                      over (partition by customer order by ts desc)
        end) as First_in_between
from activity_stream
where customer = 'Lehmanns Marktstand'
order by ts;
-- NOTE(review): when no later 'completed_order' exists the comparison is NULL and the
-- column stays NULL; confirm whether the last order should also report its email.
You can then join back to the table or use another level of window functions to get the corresponding activity_id for the timestamp.
Actually, I think I prefer another method, which is just to count the number of completed orders and then take the minimum ts:
-- Number the rows by how many 'completed_order' activities have occurred so far,
-- then report the earliest 'email' ts inside each such group.
with grouped as (
    select s.*,
           count(*) filter (where activity = 'completed_order')
               over (partition by customer order by ts) as grp
    from activity_stream s
    where customer = 'Lehmanns Marktstand'
)
select g.*,
       min(ts) filter (where activity = 'email') over (partition by grp) as email_ts
from grouped g;
This should also allow you to use a twist to get the activity id without an additional subquery:
-- Report, for each group of rows between completed orders, the activity_id of the
-- group's first 'email' (by ts).
select a.*,
       -- PostgreSQL rejects ORDER BY inside an aggregate's argument list when the
       -- aggregate is used as a window function, so the ordering goes in the window
       -- and the frame is widened to the whole partition
       (array_agg(activity_id) filter (where activity = 'email')
            over (partition by grp order by ts
                  rows between unbounded preceding and unbounded following))[1] as email_activity_id
from (select a.*,
             -- running count of completed orders per customer; it must partition by
             -- customer, because grp is the alias being defined and is not yet visible here
             count(*) filter (where activity = 'completed_order')
                 over (partition by customer order by ts) as grp
      from activity_stream a
      where customer = 'Lehmanns Marktstand'
     ) a

Convert CTE Query into normal Query

I want to convert my #PostgreSQL, CTE Query, into Normal Query because the cte function is mainly used in data warehouse SQL and not efficient for Postgres production DBS.
So, need help in converting this CTE query into a normal Query
-- cohort: every activity of customers that have at least one 'email' activity, flagged with
--   cndn = NULL for a 'completed_order' whose previous activity is not 'email',
--   cndn = 1    for an 'email' whose previous activity is not 'email',
--   cndn = 0    otherwise.
-- The final select keeps the rows flagged 1 or NULL.
WITH cohort AS (
    SELECT *
    FROM (
        SELECT
            activity_id,
            ts,
            customer,
            activity,
            CASE
                WHEN activity = 'completed_order'
                     AND LAG(activity) OVER (PARTITION BY customer ORDER BY ts) != 'email'
                    THEN NULL
                WHEN activity = 'email'
                     AND LAG(activity) OVER (PARTITION BY customer ORDER BY ts) != 'email'
                    THEN 1
                ELSE 0
            END AS cndn
        FROM activity_stream
        WHERE customer IN (SELECT customer FROM activity_stream WHERE activity = 'email')
        ORDER BY ts
    ) AS s
)
(
    SELECT *
    FROM cohort AS s
    WHERE cndn = 1 OR cndn IS NULL
    ORDER BY ts
)
You may just inline the CTE into your outer query:
-- Same result without a CTE: the flagging query becomes a derived table and the
-- filter and ordering are applied directly on top of it.
SELECT *
FROM (
    SELECT
        activity_id,
        ts,
        customer,
        activity,
        CASE
            WHEN activity = 'completed_order'
                 AND LAG(activity) OVER (PARTITION BY customer ORDER BY ts) != 'email'
                THEN NULL
            WHEN activity = 'email'
                 AND LAG(activity) OVER (PARTITION BY customer ORDER BY ts) != 'email'
                THEN 1
            ELSE 0
        END AS cndn
    FROM activity_stream
    WHERE customer IN (SELECT customer FROM activity_stream WHERE activity = 'email')
) AS s
WHERE cndn = 1 OR cndn IS NULL
ORDER BY ts;
Note that you have an unnecessary subquery in the CTE, which does an ORDER BY which won't "stick" anyway. But other than this, you might want to keep your current code as is.

I need to write a query to mark previous record as “Not eligible ” if a new record comes in within 30 days with same POS Order ID

I have a requirement to write a query that finds records whose POS_ORDER_ID appears again in the table within 30 days as a new record with status 'Canceled' or 'Discontinued', and marks the previous record for that POS_ORDER_ID as not eligible.
Table columns:
POS_ORDER_ID,
Status,
Order_date,
Error_description
A query containing MAX() and ROW_NUMBER() analytic functions might help you such as :
-- Rank each order's records from newest to oldest and remember the newest Order_date,
-- then label a non-latest record 'Not eligible' when its own status is
-- 'Canceled' or 'Discontinued' and it is at most 30 days older than the newest record.
with ranked as
(
    select src.*,
           row_number() over (partition by pos_order_id order by Order_date desc) as rn,
           max(Order_date) over (partition by pos_order_id) as mx
    from tab src -- your original table
)
select pos_order_id,
       Status,
       Order_date,
       Error_description,
       case
           when rn > 1
                and ranked.status in ('Canceled', 'Discontinued')
                and mx - ranked.Order_date <= 30
               then 'Not eligible'
       end as "Extra Status"
from ranked
Demo
Please use below query,
Select and validate
-- Preview: number each order's records from newest (1) to oldest.
select
    POS_ORDER_ID,
    Status,
    Order_date,
    Error_description,
    row_number() over (
        partition by POS_ORDER_ID
        order by Order_date desc
    )
from table_name;
Update query
-- Rank each order's records from newest (rnk = 1) to oldest and write a status back
-- onto the matching row (matched on POS_ORDER_ID and row_id).
merge into table_name t1
using (
    select row_id,
           POS_ORDER_ID,
           Status,
           Order_date,
           Error_description,
           row_number() over (partition by POS_ORDER_ID order by Order_date desc) as rnk
    from table_name
) t2
on (t1.POS_ORDER_ID = t2.POS_ORDER_ID and t1.row_id = t2.row_id)
when matched then
    update
    -- the assignment needs a target column and the CASE needs its END
    -- NOTE(review): Status is assumed to be the column to overwrite; the newest
    -- record is set to 'Canceled' unconditionally, as in the original; confirm both.
    set t1.Status = case when t2.rnk = 1 then 'Canceled' else 'Not Eligible' end;

Oracle SQL function or buckets for data filtering

-- Number each mail's transactions in date order.
-- The sequence column is aliased rn: ROWNUM is a reserved pseudocolumn name in Oracle
-- and cannot be used as a column alias.
-- NOTE(review): DATE is also an Oracle reserved word; if the column is really named
-- "date" it must be written as a quoted identifier.
SELECT
transaction
,date
,mail
,status
,ROW_NUMBER() OVER (PARTITION BY mail ORDER BY date) AS rn
FROM table1
Having the above table and script I want to be able to filter the transactions on the basis of having first 3 rowids with status 'failed' to show rowid 4 if 'failed', having transactions with rowid 4,5,6 failed - show 7 if also failed etc. I was thinking about adding it to a pandas dataframe where to run a simple lambda function , but would really like to find a solution in SQL only.
You could use lead() and lag() to explicitly check:
-- Return every FAILED row that belongs to a run of at least four consecutive FAILED
-- rows (per mail, in date order), by looking up to three rows back and three rows ahead.
select t.*
from (select t1.*,
             -- the three previous statuses (status_1 is the nearest)
             lag(status, 3) over (partition by mail order by date) as status_3,
             lag(status, 2) over (partition by mail order by date) as status_2,
             lag(status, 1) over (partition by mail order by date) as status_1,
             -- the three following statuses (status_1n is the nearest)
             lead(status, 1) over (partition by mail order by date) as status_1n,
             lead(status, 2) over (partition by mail order by date) as status_2n,
             lead(status, 3) over (partition by mail order by date) as status_3n
      from table1 t1
     ) t
where status = 'FAILED' and
      ( -- the current row may sit at any of the four positions of a 4-row run
        (status_3 = 'FAILED' and status_2 = 'FAILED' and status_1 = 'FAILED') or
        (status_2 = 'FAILED' and status_1 = 'FAILED' and status_1n = 'FAILED') or
        (status_1 = 'FAILED' and status_1n = 'FAILED' and status_2n = 'FAILED') or
        (status_1n = 'FAILED' and status_2n = 'FAILED' and status_3n = 'FAILED')
      )
This is a bit brute force, but I think the logic is quite clear.
You could simplify the logic to:
-- The seven statuses are concatenated in row order; the group (FAILED) must be
-- parenthesised so that {4} repeats the whole word rather than only the final D.
where regexp_like(status_3 || status_2 || status_1 || status || status_1n || status_2n || status_3n,
'(FAILED){4}'
)
Try this:
-- Number each mail's FAILED transactions in date order and keep the 4th, 7th, 10th, ...
select *
from (select transaction,
             date,
             mail,
             status,
             -- aliased rn: ROWNUM is Oracle's reserved pseudocolumn, so an outer
             -- reference to "rownum" would not read this column
             row_number() over (partition by mail order by date) as rn
      from table1
      where status = 'FAILED')
where mod(rn, 3) = 1
  and rn > 3;  -- the first reportable row is the 4th; row 1 is not preceded by three failures
-- NOTE(review): the numbering covers FAILED rows only, so it does not check that the
-- failures are consecutive among all transactions.
Richard
One option is to use window functions. Use lag to get the previous status value (based on specified ordering) and compare it with the current row's value and assign groups with a running sum. Then count the values in each group and finally filter for that condition.
-- Gaps-and-islands: consecutive FAILED rows of a mail share one grp value,
-- and only groups with at least four rows are returned.
WITH with_prev AS (
    -- status of the previous row per mail
    SELECT src.*,
           LAG(status) OVER (PARTITION BY mail ORDER BY "date", "transaction") AS prev_status
    FROM tbl src
),
grouped AS (
    -- the running sum stays flat while a FAILED row continues a FAILED run (or starts the partition)
    SELECT p.*,
           SUM(CASE
                   WHEN status = 'FAILED'
                        AND (prev_status IS NULL OR prev_status = 'FAILED') THEN 0
                   ELSE 1
               END) OVER (PARTITION BY mail ORDER BY "date", "transaction") AS grp
    FROM with_prev p
),
counted AS (
    -- size of each group
    SELECT g.*,
           COUNT(*) OVER (PARTITION BY mail, grp) AS grp_count
    FROM grouped g
)
SELECT c.*
FROM counted c
WHERE grp_count >= 4
If you are using versions starting with Oracle 12c, there is an option to use MATCH_RECOGNIZE which would simplify this.
-- Oracle 12c+ row pattern matching: per mail, in ("date", "transaction") order,
-- match runs of four or more rows whose status is 'FAILED' and return every matched row.
SELECT *
FROM tbl
    MATCH_RECOGNIZE (
        PARTITION BY mail
        ORDER BY "date", "transaction"
        ALL ROWS PER MATCH
        AFTER MATCH SKIP TO LAST FAIL
        PATTERN (fail{4,})
        DEFINE
            fail AS (status = 'FAILED')
    ) MR
ORDER BY "date", "transaction"

Getting the value of a previous record using ROW_NUMBER() in SQL Server

Hopefully this is easy enough for those more experienced in SQL Server.
I have a table of customer loan activity data which is updated whenever an action happens on their account. For example, if their limit is increased, a new record will be created with their new limit. I want to be able to create a listing of their activity where the activity amount is their new limit minus whatever their previous limit was.
At the moment I have the following but I'm struggling to work out how to access that previous record.
-- Asker's work-in-progress: subtract another ACTIVITY row's AMOUNT (0 when none matches)
-- from each row's AMOUNT.
-- NOTE(review): CUSTOMER, LEDGER and ACCOUNT in the select list are unqualified although
-- both H and X expose them; they presumably need an H. prefix.
SELECT
CUSTOMER
,LEDGER
,ACCOUNT
,H.AMOUNT - COALESCE(X.AMOUNT, 0)
FROM
dbo.ACTIVITY H WITH (NOLOCK)
LEFT OUTER JOIN
-- X: the same table with a per-account sequence number in ACTIVITY_DATE order
(SELECT
CUSTOMER
,LEDGER
,ACCOUNT
,ACTIVITY_DATE
,AMOUNT
,ROW_NUMBER() OVER (PARTITION BY CUSTOMER, LEDGER, ACCOUNT ORDER BY ACTIVITY_DATE ASC) AS ROW_NUMBER
FROM
-- the join matches on the account key only; no condition uses ROW_NUMBER yet,
-- so each H row pairs with every X row of the same account
dbo.ACTIVITY WITH (NOLOCK)) X ON H.CUSTOMER = X.CUSTOMER
AND H.LEDGER = X.LEDGER
AND H.ACCOUNT = X.ACCOUNT
So basically I only want to subtract x.amount if it's the previous record but I'm not sure how to do this when I don't know what day it happened.
I thought Row_Number() would help me but I'm still a bit stumped.
Hope to hear from you all soon :)
Cheers
Here's a query that will only pass through dbo.Activity ONCE
-- Each numbered row (1 = newest per account) is emitted up to twice: with SLOT = 1 it adds
-- its AMOUNT to bucket RN - 1, with SLOT = 2 it subtracts its AMOUNT from bucket RN - 2.
-- Every bucket therefore holds a row's AMOUNT minus the AMOUNT of the row just before it in time.
SELECT
    R.CUSTOMER,
    R.LEDGER,
    R.ACCOUNT,
    MAX(R.ACTIVITY_DATE) AS ACTIVITY_DATE,
    SUM(CASE WHEN P.SLOT = 1 THEN R.AMOUNT ELSE -R.AMOUNT END) AS AMOUNT
FROM (
    SELECT
        CUSTOMER,
        LEDGER,
        ACCOUNT,
        ACTIVITY_DATE,
        AMOUNT,
        ROW_NUMBER() OVER (
            PARTITION BY CUSTOMER, LEDGER, ACCOUNT
            ORDER BY ACTIVITY_DATE DESC
        ) AS RN
    FROM dbo.ACTIVITY WITH (NOLOCK)
) AS R
CROSS JOIN (SELECT 1 UNION ALL SELECT 2) AS P(SLOT)
WHERE R.RN >= P.SLOT
GROUP BY
    R.CUSTOMER,
    R.LEDGER,
    R.ACCOUNT,
    R.RN - P.SLOT;
And here's the DDL/DML for some data I used to test
-- Sample table and rows used to try out the queries.
CREATE TABLE dbo.ACTIVITY (
    CUSTOMER      int,
    LEDGER        int,
    ACCOUNT       int,
    ACTIVITY_DATE datetime,
    AMOUNT        int
);

INSERT INTO dbo.ACTIVITY (CUSTOMER, LEDGER, ACCOUNT, ACTIVITY_DATE, AMOUNT)
VALUES
    (1, 2, 3, GETDATE(),     123),
    (1, 2, 3, GETDATE() - 1, 16),
    (1, 2, 3, GETDATE() - 2, 12),
    (1, 2, 3, GETDATE() - 3, 1),
    (4, 5, 6, GETDATE(),     1000),
    (4, 5, 6, GETDATE() - 6, 123),
    (7, 7, 7, GETDATE(),     99);
Alternatives
A more traditional approach using a subquery to get the previous row:
-- For each activity row, subtract the AMOUNT of the same account's latest earlier row
-- (0 when there is none).
SELECT
    O.CUSTOMER,
    O.LEDGER,
    O.ACCOUNT,
    O.ACTIVITY_DATE,
    O.AMOUNT - ISNULL(
        (
            SELECT TOP (1) I.AMOUNT
            FROM dbo.ACTIVITY AS I
            WHERE I.CUSTOMER = O.CUSTOMER
              AND I.LEDGER = O.LEDGER
              AND I.ACCOUNT = O.ACCOUNT
              AND I.ACTIVITY_DATE < O.ACTIVITY_DATE
            ORDER BY I.ACTIVITY_DATE DESC
        ),
        0
    ) AS AMOUNT
FROM dbo.ACTIVITY AS O
ORDER BY
    O.CUSTOMER,
    O.LEDGER,
    O.ACCOUNT,
    O.ACTIVITY_DATE;
Or ROW_NUMBER() the data twice and join between them
-- Number each account's rows in date order once, then pair every row with the row
-- numbered one lower and subtract that row's AMOUNT (0 for the first row).
WITH NUMBERED AS (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY CUSTOMER, LEDGER, ACCOUNT
               ORDER BY ACTIVITY_DATE ASC
           ) AS RN
    FROM dbo.ACTIVITY
)
SELECT
    CUR.CUSTOMER,
    CUR.LEDGER,
    CUR.ACCOUNT,
    CUR.ACTIVITY_DATE,
    CUR.AMOUNT - ISNULL(PRIOR_ROW.AMOUNT, 0) AS AMOUNT
FROM NUMBERED AS CUR
LEFT JOIN NUMBERED AS PRIOR_ROW
    ON PRIOR_ROW.CUSTOMER = CUR.CUSTOMER
   AND PRIOR_ROW.LEDGER = CUR.LEDGER
   AND PRIOR_ROW.ACCOUNT = CUR.ACCOUNT
   AND PRIOR_ROW.RN = CUR.RN - 1 -- prior record
ORDER BY
    CUR.CUSTOMER,
    CUR.LEDGER,
    CUR.ACCOUNT,
    CUR.ACTIVITY_DATE;