time difference between events in two different tables - sql

I have a paid_activity and free_activity table:
paid_activity model:
Field
Type
Description
pk
number
hash(activity_date, user_id, product_id)
date
date
date of the activity
uid
number
user who generated the activity
pid
number
product associated with the activity
free_activity model:
Field
Type
Description
pk
number
hash(activity_date, user_id, product_id)
date
date
date of the activity
uid
number
user who generated the activity
pid
number
product associated with the activity
I need to produce a dormancy table with the model:
Field
Type
Description
pk
number
hash(activity_date, user_id, product_id)
date
date
date of the spend activity
uid
number
user who generated the activity
pid
number
product associated with the activity
paid_dormancy
int
days since the user's last paid activity
paid_product_dormancy
int
days since the user's last paid activity with the same product
free_dormancy
int
days since the user's last free activity
free_product_dormancy
int
days since the user's last free activity with the same product
The dormancy table should have a 1:1 row correspondence to the paid_activity table.
I started by making an intermediate paid_dormancy table without the free_ fields:
Field
Type
Description
pk
number
hash(activity_date, user_id, product_id)
date
date
date of the spend activity
uid
number
user who generated the activity
pid
number
product the user spent on
paid_dormancy
int
days since the user's last paid activity
paid_product_dormancy
int
days since the user's last paid activity with the same product
Code:
-- For every paid activity row: days elapsed since the same user's previous paid
-- activity, across all products and restricted to the same product.
-- lag() has no earlier row for the first activity in a partition, so that gap is NULL.
select
    pk,
    date,
    uid,
    pid,
    date - lag(date) ignore nulls over (partition by uid order by date) as paid_dormancy,
    date - lag(date) ignore nulls over (partition by uid, pid order by date) as paid_product_dormancy
from paid_activity
I'm having trouble figuring out how to merge in free_activity to build the dormancy table.
I cobbled together the following query which doesn't produce the correct results at all:
-- Asker's attempt (reported as returning wrong results): stack paid and free rows,
-- then look back with lag(). The only change here is the ")" that closes the
-- dormancy_union CTE, which was missing; the logic is left as posted.
-- NOTE(review): lag(date) returns the date of whichever row precedes in the window
-- order, paid or free, and "order by fpk, date" sorts free rows by their hash key
-- rather than chronologically, so the value is not "latest free activity before
-- this paid row".
with dormancy_union as (
    select
        ppk pk
        , date
        , uid
        , pid
        , paid_dormancy
        , paid_product_dormancy
        -- free rows (ppk is null) get NULL; they are filtered out at the end anyway
        , iff(ppk is null, null,
            date - lag(date) ignore nulls over(
                partition by uid
                order by fpk, date
            )) free_dormancy
        , iff(ppk is null, null,
            date - lag(date) ignore nulls over(
                partition by uid, pid
                order by fpk, date
            )) free_product_dormancy
    from (
        -- ppk / fpk mark which source table a row came from
        select pk ppk, null fpk, * from paid_dormancy
        union all
        select null ppk, pk fpk, *, null, null from free_activity
    )
)
select *
from dormancy_union
where pk is not null
order by date;

So with some example data:
-- Inline sample data. pk is hash() over the three raw literal columns; date is the
-- first column parsed as yyyy-mm-dd.
with paid_activity (pk, date, uid, pid) as (
    select
        hash(column1, column2, column3),
        to_date(column1, 'yyyy-mm-dd'),
        column2,
        column3
    from values
        ('2023-02-14', 1, 10),
        ('2023-02-01', 1, 11),
        ('2023-01-13', 1, 10)
),
free_activity (pk, date, uid, pid) as (
    select
        hash(column1, column2, column3),
        to_date(column1, 'yyyy-mm-dd'),
        column2,
        column3
    from values
        ('2023-02-15', 1, 10),
        ('2023-02-11', 1, 10),
        ('2023-01-20', 1, 10),
        ('2023-01-01', 1, 10)
)
We can build the priors for the free activity with a date-range join, and then keep only the most recent match per input row:
-- Continues the WITH list opened by the sample-data CTEs; the preceding CTE already
-- ends with ")", so this fragment starts at the comma.
-- with_priors: every paid row plus the latest free activity date on or before it,
-- for the same user (f_date) and for the same user and product (fp_date).
, with_priors as (
    select
        z.*,
        fp.date as fp_date
    from (
        -- latest free activity of the same user, any product
        select
            p.*,
            f.date as f_date
        from paid_activity as p
        left join free_activity as f
            on f.uid = p.uid
            and f.date <= p.date
        -- the join fans out to every earlier free row; keep only the most recent one
        qualify row_number() over (partition by p.date, p.uid, p.pid order by f.date desc) = 1
    ) as z
    -- latest free activity of the same user on the same product
    left join free_activity as fp
        on fp.uid = z.uid
        and fp.pid = z.pid
        and fp.date <= z.date
    qualify row_number() over (partition by z.date, z.uid, z.pid order by fp.date desc) = 1
)
select
    p.*,
    -- previous paid activity dates, shown for inspection
    lag(p.date) ignore nulls over (partition by p.uid order by p.date) as l_p_act,
    lag(p.date) ignore nulls over (partition by p.uid, p.pid order by p.date) as l_pp_act,
    datediff('days', lag(p.date) ignore nulls over (partition by p.uid order by p.date), p.date) as paid_dormancy,
    datediff('days', lag(p.date) ignore nulls over (partition by p.uid, p.pid order by p.date), p.date) as paid_product_dormancy,
    -- NULL when the user has no free activity on or before the paid date
    datediff('days', p.f_date, p.date) as free_dormancy,
    datediff('days', p.fp_date, p.date) as free_product_dormancy
from with_priors as p
order by p.date, p.uid, p.pid;
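| PK | DATE | UID | PID | F_DATE | FP_DATE | L_P_ACT | L_PP_ACT | PAID_DORMANCY | PAID_PRODUCT_DORMANCY | FREE_DORMANCY | FREE_PRODUCT_DORMANCY |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9,052,164,364,143,044,634 | 2023-01-13 | 1 | 10 | 2023-01-01 | 2023-01-01 | null | null | null | null | 12 | 12 |
| 3,292,356,339,691,413,099 | 2023-02-01 | 1 | 11 | 2023-01-20 | null | 2023-01-13 | null | 19 | null | 12 | null |
| -3,195,136,054,197,415,933 | 2023-02-14 | 1 | 10 | 2023-02-11 | 2023-02-11 | 2023-02-01 | 2023-01-13 | 13 | 32 | 3 | 3 |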
so that last block can be cleaner:
-- Final dormancy shape: one row per paid activity with the four day-gap columns.
select
    p.pk,
    p.date,
    p.uid,
    p.pid,
    datediff(
        'days',
        lag(p.date) ignore nulls over (partition by p.uid order by p.date),
        p.date
    ) as paid_dormancy,
    datediff(
        'days',
        lag(p.date) ignore nulls over (partition by p.uid, p.pid order by p.date),
        p.date
    ) as paid_product_dormancy,
    datediff('days', p.f_date, p.date) as free_dormancy,
    datediff('days', p.fp_date, p.date) as free_product_dormancy
from with_priors as p
order by p.date, p.uid, p.pid;
giving:
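| PK | DATE | UID | PID | PAID_DORMANCY | PAID_PRODUCT_DORMANCY | FREE_DORMANCY | FREE_PRODUCT_DORMANCY |
|---|---|---|---|---|---|---|---|
| 9,052,164,364,143,044,634 | 2023-01-13 | 1 | 10 | null | null | 12 | 12 |
| 3,292,356,339,691,413,099 | 2023-02-01 | 1 | 11 | 19 | null | 12 | null |
| -3,195,136,054,197,415,933 | 2023-02-14 | 1 | 10 | 13 | 32 | 3 | 3 |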

Related

Top 2 per month in SQL

I have this dataset, which has dates and products for cities:
-- One row per visit: visit id, visit timestamp, city code and product code.
CREATE TABLE my_table (
    the_id      VARCHAR(5) NOT NULL,
    the_date    TIMESTAMP  NOT NULL,
    the_city    VARCHAR(5) NOT NULL,
    the_product VARCHAR(1) NOT NULL
);
-- Sample visits for May-July 2019.
INSERT INTO my_table (the_id, the_date, the_city, the_product)
VALUES
    ('VIS01', '2019-05-02 09:00:00', 'LISBO', 'A'),
    ('VIS02', '2019-05-04 12:00:00', 'EVORA', 'A'),
    ('VIS03', '2019-05-05 18:00:00', 'LISBO', 'B'),
    ('VIS04', '2019-05-06 18:30:00', 'PORTO', 'B'),
    ('VIS05', '2019-05-15 12:05:00', 'PORTO', 'C'),
    ('VIS06', '2019-06-02 18:06:00', 'EVORA', 'C'),
    ('VIS07', '2019-06-02 18:07:00', 'PORTO', 'A'),
    ('VIS08', '2019-06-04 18:08:00', 'EVORA', 'B'),
    ('VIS09', '2019-06-07 18:09:00', 'LISBO', 'B'),
    ('VIS10', '2019-06-09 18:10:00', 'LISBO', 'D'),
    ('VIS11', '2019-06-12 18:11:00', 'EVORA', 'D'),
    ('VIS12', '2019-06-15 18:12:00', 'LISBO', 'E'),
    ('VIS13', '2019-06-15 18:13:00', 'EVORA', 'F'),
    ('VIS14', '2019-06-18 18:14:00', 'PORTO', 'G'),
    ('VIS15', '2019-06-23 18:15:00', 'LISBO', 'A'),
    ('VIS16', '2019-06-25 18:16:00', 'LISBO', 'A'),
    ('VIS17', '2019-06-27 18:17:00', 'LISBO', 'F'),
    ('VIS18', '2019-06-27 18:18:00', 'LISBO', 'A'),
    ('VIS19', '2019-06-28 18:19:00', 'LISBO', 'A'),
    ('VIS20', '2019-06-30 18:20:00', 'EVORA', 'D'),
    ('VIS21', '2019-07-01 18:21:00', 'EVORA', 'D'),
    ('VIS22', '2019-07-04 18:30:00', 'EVORA', 'D'),
    ('VIS23', '2019-07-04 18:31:00', 'EVORA', 'B'),
    ('VIS24', '2019-07-06 18:40:00', 'EVORA', 'K'),
    ('VIS25', '2019-07-12 18:50:00', 'EVORA', 'G'),
    ('VIS26', '2019-07-15 18:00:00', 'PORTO', 'C'),
    ('VIS27', '2019-07-18 18:00:00', 'PORTO', 'C'),
    ('VIS28', '2019-07-25 18:00:00', 'PORTO', 'B'),
    ('VIS29', '2019-07-30 18:00:00', 'PORTO', 'M');
And I want the top two per month. The expected result should be:
month product count
2019-05 A 2
2019-05 B 2
2019-06 A 5
2019-06 D 3
2019-07 C 2
2019-07 D 2
But I'm not quite sure how to group by month. Please, any help will be greatly appreciated.
First, you can use to_char(the_date,'YYYY-MM') to get the year and month in the right format.
Next, you can group by the month and product, using count(*) to count the rows in each group, and use row_number() to give a sequence number to each product within its month.
-- Count rows per (month, product) and number the products inside each month from
-- most to least frequent.
-- Ties on the count are broken by product code so the numbering is repeatable:
-- in the sample data July 2019 has three products (B, C, D) with 2 rows each, and
-- ordering by count(*) alone would pick two of them arbitrarily.
SELECT
    to_char(the_date, 'YYYY-MM') AS month,
    the_product AS product,
    count(*) AS p_count,
    row_number() OVER (
        PARTITION BY to_char(the_date, 'YYYY-MM')
        ORDER BY count(*) DESC, the_product
    ) AS seq
FROM my_table
GROUP BY to_char(the_date, 'YYYY-MM'), the_product
Last, you can wrap that in an outer query to select just the columns and rows that you want.
-- Top two products per month by row count.
-- The inner query counts rows per (month, product) and numbers products within
-- each month; the outer query keeps sequence numbers 1 and 2.
-- Ties on the count are broken by product code so the result is repeatable
-- (July 2019 in the sample data has B, C and D tied at 2, so B and C are kept).
SELECT month, product, p_count AS count
FROM (
    SELECT
        to_char(the_date, 'YYYY-MM') AS month,
        the_product AS product,
        count(*) AS p_count,
        row_number() OVER (
            PARTITION BY to_char(the_date, 'YYYY-MM')
            ORDER BY count(*) DESC, the_product
        ) AS seq
    FROM my_table
    GROUP BY to_char(the_date, 'YYYY-MM'), the_product
) AS foo
WHERE foo.seq <= 2
ORDER BY foo.month, foo.p_count DESC, foo.product;
You can use aggregation and window functions:
-- Top two products per calendar month, bucketing with date_trunc (yyyymm is the
-- first instant of the month).
-- the_product is added as a tie-breaker so that months where several products
-- share the same count always return the same two rows.
select mp.*
from (
    select
        date_trunc('month', the_date) as yyyymm,
        the_product,
        count(*) as cnt,
        row_number() over (
            partition by date_trunc('month', the_date)
            order by count(*) desc, the_product
        ) as seqnum
    from my_table
    group by date_trunc('month', the_date), the_product
) mp
where seqnum <= 2
order by mp.yyyymm, mp.cnt desc, mp.the_product;
In PostgreSQL, I believe you can extract each part of a timestamp using the EXTRACT function.
e.g.:
-- EXTRACT pulls one field (here the month number) out of the timestamp column.
-- The FROM clause is required because the_date is a column of my_table.
SELECT the_date, EXTRACT(MONTH from the_date) as MONTH
FROM my_table
the_date
MONTH
'2019-08-05'
08
that said, you can then group by Product, then Month, and Select the TOP 2
-- Top two products per month using EXTRACT.
-- The year is extracted as well so the same month of different years is not merged.
-- A global "ORDER BY count(*) LIMIT 2" would return only two rows in total (and the
-- least frequent ones), so products are numbered inside each month instead and the
-- first two are kept. the_product breaks ties on the count.
SELECT year, month, the_product, product_count AS count
FROM (
    SELECT
        EXTRACT(YEAR FROM the_date) AS year,
        EXTRACT(MONTH FROM the_date) AS month,
        the_product,
        count(*) AS product_count,
        row_number() OVER (
            PARTITION BY EXTRACT(YEAR FROM the_date), EXTRACT(MONTH FROM the_date)
            ORDER BY count(*) DESC, the_product
        ) AS seq
    FROM my_table
    GROUP BY EXTRACT(YEAR FROM the_date), EXTRACT(MONTH FROM the_date), the_product
) AS ranked
WHERE ranked.seq <= 2
ORDER BY ranked.year, ranked.month, ranked.product_count DESC, ranked.the_product
There might be some optimization to do since I don't have a Database to test the query, but it might give you a good start

Calculate average days between orders The last three records tsql

I am trying to take an average per customer, but my query is not grouping by customer.
I would like to calculate the average number of days between order dates from a table called invoice. For each BusinessPartnerID, I want the average number of days between orders, computed over that partner's last three orders only.
I managed to get the average over all orders for each user, but I need it over the last three orders only.
The sample table is as below
-- Asker's query: average day gap between consecutive invoice rows, reported per
-- invoice / car / buyer.
-- The CTE exposes AVG(Lag) under the column name "avg".
;WITH temp (avg,invoiceid,carname,carid,fullname,mobail)
AS
(
SELECT AvgLag = AVG(Lag) , Lagged.idinvoice,
Lagged.carname ,
Lagged.carid ,Lagged.fullname,Lagged.mobail
FROM
(
-- Lag = whole days between this row's Date and the previous row's Date for the
-- same car (PARTITION BY car2.Id ORDER BY Date); NULL for a car's first row.
-- NOTE(review): Date and Type are not table-qualified, so which joined table they
-- come from cannot be told from this query alone.
SELECT
(car2.Name) as carname ,
(car2.id) as carid ,( busin.Name) as fullname, ( busin.Mobile) as mobail , INV.Id as idinvoice , Lag = CONVERT(int, DATEDIFF(DAY, LAG(Date,1)
OVER (PARTITION BY car2.Id ORDER BY Date ), Date))
FROM [dbo].[Invoice] AS INV
JOIN [dbo].[InvoiceItem] AS INITEM on INV.Id=INITEM.Invoiceid
JOIN [dbo].[BusinessPartner] as busin on busin.Id=INV.BuyerId and Type=5
JOIN [dbo].[Product] as pt on pt.Id=INITEM.ProductId and INITEM.ProductId is not null and pt.ProductTypeId=3
JOIN [dbo].[Car] as car2 on car2.id=INv.BusinessPartnerCarId
) AS Lagged
-- The grouping key includes the invoice id, so each average is taken over the rows
-- of a single invoice rather than over all invoices of a buyer.
GROUP BY
Lagged.carname,
Lagged.carid,Lagged.fullname,Lagged.mobail, Lagged.idinvoice
-- order by Lagged.fullname
)
-- Drop groups whose average is NULL and list the rest by ascending average gap.
SELECT * FROM temp where avg is not null order by avg
I don't really see how your query relates to your question. Starting from a table called invoice that has columns businesspartnerid and date, here is how you would take the average of the day difference between the last 3 invoices of each business partner:
-- Average number of days between the last three invoices of each business partner.
-- Three steps, innermost first:
--   1. number each partner's invoices from newest (rn = 1) to oldest;
--   2. keep the newest three and compute the day gap to the previous invoice with
--      lag(); the WHERE filter runs before the window function, so lag() only sees
--      those three rows and the oldest of them gets a NULL gap;
--   3. average the gaps per partner (avg ignores the NULL, so two gaps are averaged).
-- The lag() has to live in its own query level because SQL Server does not accept a
-- window function as the argument of an aggregate. 1.0 * forces a decimal average
-- instead of integer division.
select
    businesspartnerid,
    avg(1.0 * diff_day) as avg_diff_day
from (
    select
        businesspartnerid,
        datediff(
            day,
            lag(date) over (partition by businesspartnerid order by date),
            date
        ) as diff_day
    from (
        select
            i.businesspartnerid,
            i.date,
            row_number() over (partition by i.businesspartnerid order by i.date desc) as rn
        from invoice as i
    ) as numbered
    where rn <= 3
) as gaps
group by businesspartnerid
Note that 3 rows gives you 2 intervals only, that will be averaged.

How to change rank according to data changes

I want to have a window function to rank the month according to their values. So in this example, 2018-12 be rank 1, 2019-01 be 2, etc.
And I also want the rank counter to reset after it goes to a new cohort, in this case, cohort 2, the rank should start from 1 again, and the pattern will be similar to cohort 1
-- Asker's attempt: build two cohorts from the same terms table (ids from 125 and
-- ids from 126) and rank the rows.
-- Because month appears in both PARTITION BY and ORDER BY, all rows of a partition
-- share the same month and tie, so rank() is 1 on every row.
SELECT
    *,
    rank() OVER (PARTITION BY cohort, month ORDER BY month ASC)
FROM (
    SELECT
        1 AS cohort,
        id,
        date_trunc('month', start_date) AS month
    FROM _analysis.terms
    WHERE holiday=FALSE
      AND id >= 125
    UNION
    SELECT
        2,
        id,
        date_trunc('month', start_date)
    FROM _analysis.terms
    WHERE holiday=FALSE
      AND id >= 126
    ORDER BY cohort, id, month
)
ORDER BY cohort, id, month
This may help
USE AdventureWorks2014
GO
-- Rank orders by the calendar month they fall in; every order of the same month
-- gets the same rank and ranks have no gaps (DENSE_RANK).
-- Ordering by YEAR then MONTH keeps the ranking chronological: MONTH() alone returns
-- 1-12, which would put January 2019 ahead of December 2018.
SELECT
    SalesOrderID,
    OrderDate,
    DENSE_RANK() OVER (ORDER BY YEAR(OrderDate) ASC, MONTH(OrderDate) ASC) AS [Rank]
FROM [Sales].[SalesOrderHeader]

Active customers for each day who were active in last 30 days

I have a BQ table, user_events that looks like the following:
event_date | user_id | event_type
Data is for Millions of users, for different event dates.
I want to write a query that will give me a list of users for every day who were active in last 30 days.
This gives me total unique users on only that day; I can't get it to give me the last 30 for each date. Help is appreciated.
-- BigQuery legacy SQL: distinct (user, date) pairs with an event in the 30 days
-- before now, newest date first.
SELECT
  user_id,
  event_date
FROM [TableA]
WHERE user_id IS NOT NULL
  AND event_date >= DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY')
GROUP BY
  user_id,
  event_date
ORDER BY
  event_date DESC
Below is for BigQuery Standard SQL and has few assumption about your case:
there is only one row per date per user
user is considered active in last 30 days if user has at least 5 (sure can be any number - even just 1) entries/rows within those 30 days
If above make sense - see below
#standardSQL
-- For each (user, date) row, count that user's rows dated 1 to 30 days earlier;
-- the row is flagged active when there are at least 5 of them.
WITH scored AS (
  SELECT
    user_id,
    event_date,
    (COUNT(1) OVER (
      PARTITION BY user_id
      ORDER BY UNIX_DATE(event_date)
      RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
    )) >= 5 AS activity
  FROM `yourTable`
)
SELECT
  user_id,
  event_date
FROM scored
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date
If assumption #1 above is not correct, you can simply add pre-grouping as a sub-select:
#standardSQL
-- Same check as the single-row-per-day version, but the events are first collapsed
-- to one row per (user, date) so several events on one day count once.
WITH daily AS (
  SELECT
    user_id,
    event_date
  FROM `yourTable`
  GROUP BY user_id, event_date
),
scored AS (
  SELECT
    user_id,
    event_date,
    (COUNT(1) OVER (
      PARTITION BY user_id
      ORDER BY UNIX_DATE(event_date)
      RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
    )) >= 5 AS activity
  FROM daily
)
SELECT
  user_id,
  event_date
FROM scored
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date
UPDATE
From comments: If user have any of the event_type IN ('view', 'conversion', 'productDetail', 'search') , they will be considered active. That means any kind of event triggered within the app
So, you can go with below, I think
#standardSQL
-- Only the listed event types count as activity. Events are collapsed to one row
-- per (user, date), then a row is flagged active when the same user has at least
-- 5 such days in the 30 days before it.
WITH daily AS (
  SELECT
    user_id,
    event_date
  FROM `yourTable`
  WHERE event_type IN ('view', 'conversion', 'productDetail', 'search')
  GROUP BY user_id, event_date
),
scored AS (
  SELECT
    user_id,
    event_date,
    (COUNT(1) OVER (
      PARTITION BY user_id
      ORDER BY UNIX_DATE(event_date)
      RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
    )) >= 5 AS activity
  FROM daily
)
SELECT
  user_id,
  event_date
FROM scored
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date

Retrieve recent 5 days forecast for each cities with latest issue date

I need to retrieve the most recent 5 days of forecast info for each city.
My table looks like below
The real problem is with the issue date:
a city may contain several forecasts for the same forecast date, each with a distinct issue date.
For each city I need the 5 most recent forecast dates, one row per forecast date, taking the row with the latest issue date.
I have tried something like below but not giving the expected result
-- Asker's attempt: number each city's rows by newest forecast date, then newest
-- issue date, and keep the first five.
-- NOTE(review): CITY_ID and ISSUE_DATE are selected but are not part of
-- GROUP BY FORECAST_DATE and are not aggregated; most engines reject that —
-- confirm against the target database.
SELECT * FROM(
SELECT
ROW_NUMBER () OVER (PARTITION BY CITY_ID ORDER BY FORECAST_DATE DESC, ISSUE_DATE DESC) AS rn,
CITY_ID, FORECAST_DATE, ISSUE_DATE
FROM
FORECAST
GROUP BY FORECAST_DATE
) WHERE rn <= 5
Any suggestion or advice will be helpful
This will get the latest issued forecast per day over the most recent 5 days for each city:
-- forecast_rank numbers a city's forecast dates from newest to oldest (same date,
-- same rank); issue_rn numbers the issues of one forecast date from newest to oldest.
-- Keeping issue_rn = 1 within the first five forecast dates yields the latest
-- issued forecast for each of a city's five most recent forecast dates.
WITH ranked AS (
    SELECT
        f.*,
        DENSE_RANK() OVER (PARTITION BY city_id ORDER BY forecast_date DESC) AS forecast_rank,
        ROW_NUMBER() OVER (PARTITION BY city_id, forecast_date ORDER BY issue_date DESC) AS issue_rn
    FROM Forecast f
)
SELECT *
FROM ranked
WHERE issue_rn = 1
  AND forecast_rank <= 5;
Partition by works like group by but for the function only.
Try
-- Latest issued forecast for each of a city's five most recent forecast dates.
--   d_ord: position of the forecast date within the city, newest first
--          (dense_rank, so all issues of one date share the same value);
--   r_ord: position of the issue within one (city, forecast date), newest first.
-- The table needs the alias t1 for "t1.*" to resolve. Filtering r_ord <= 5 alone
-- would keep up to five issues of every forecast date instead.
with CTE as
(
    select
        t1.*,
        dense_rank() over (partition by city_id order by forecast_date desc) as d_ord,
        row_number() over (partition by city_id, forecast_date order by issue_date desc) as r_ord
    from Forecast t1
)
select CTE.*
from CTE
where d_ord <= 5
  and r_ord = 1
Try this
-- Latest issued forecast for each of a city's five most recent forecast dates.
--   rn:            issue position within one (city, forecast date), newest first;
--   forecast_rank: forecast-date position within the city, newest first.
-- rn <= 5 on its own would return up to five issues for every forecast date, so the
-- filter keeps rn = 1 and limits the forecast dates with forecast_rank.
SELECT * FROM (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY CITY_ID, FORECAST_DATE ORDER BY ISSUE_DATE DESC) AS rn,
        CITY_ID,
        FORECAST_DATE,
        ISSUE_DATE,
        DENSE_RANK() OVER (PARTITION BY CITY_ID ORDER BY FORECAST_DATE DESC) AS forecast_rank
    FROM FORECAST
)
WHERE rn = 1
  AND forecast_rank <= 5