Append select with missing month and year values - sql

I have SELECT:
-- Running average over the per-month averages of business 1.
-- The window has no ORDER BY of its own; it depends on the row order
-- delivered by the inner query.
SELECT
    month,
    year,
    ROUND(
        AVG(q_overall) OVER (ROWS BETWEEN 10000 PRECEDING AND CURRENT ROW),
        2
    ) AS avg
FROM (
    -- one row per (year, month) with that month's rounded average
    SELECT
        EXTRACT(MONTH FROM date) AS month,
        EXTRACT(YEAR FROM date) AS year,
        ROUND(AVG(q_overall), 1) AS q_overall
    FROM fb_parsed
    WHERE business_id = 1
    GROUP BY year, month
    ORDER BY year, month
) monthly
output:
month year avg
-----------------
12 2012 5
1 2013 4.5
2 2013 4.1
4 2013 4.8
5 2013 4.7
I need to extend this result with rows for the missing months (in this example, the 3rd month of 2013). The avg of a missing month must be the same as in the previous row, which means I need to add this row:
3 2013 4.1
Can I do this with SELF JOINS and generate_series, or with some UNION select?

You can simplify your select. It doesn't need a subquery:
-- Single-level form: the monthly AVG is aggregated first, then the windowed
-- AVG runs over those monthly results (an aggregate nested in a window call).
SELECT
    EXTRACT(MONTH FROM date) AS month,
    EXTRACT(YEAR FROM date) AS year,
    ROUND(AVG(q_overall), 1) AS q_overall,
    ROUND(
        AVG(AVG(q_overall)) OVER (ROWS BETWEEN 10000 PRECEDING AND CURRENT ROW),
        2
    )
FROM fb_parsed
WHERE business_id = 1
GROUP BY year, month;
The window function needs an ORDER BY. I assume you really intend:
-- Cumulative average of the monthly averages, accumulated in calendar order.
-- The window's ORDER BY repeats the EXTRACT expressions because PostgreSQL
-- does not resolve SELECT-list aliases inside an OVER clause.
SELECT EXTRACT(MONTH FROM date) AS month,
       EXTRACT(YEAR FROM date) AS year,
       ROUND(AVG(q_overall), 1) AS q_overall,
       ROUND(AVG(AVG(q_overall)) OVER (ORDER BY EXTRACT(YEAR FROM date),
                                                EXTRACT(MONTH FROM date)), 2) AS avg
FROM fb_parsed
WHERE business_id = 1
GROUP BY EXTRACT(YEAR FROM date), EXTRACT(MONTH FROM date)
ORDER BY year, month;
Then, to fill in the values you can use generate_series():
-- ym holds one row per calendar month between the first and last row of
-- business 1; the LEFT JOIN keeps months that have no data.
-- AVG skips the NULLs coming from empty months, so the cumulative average of
-- an empty month equals the value of the previous month.
SELECT EXTRACT(MONTH FROM ym.date) AS month,
       EXTRACT(YEAR FROM ym.date) AS year,
       ROUND(AVG(AVG(p.q_overall)) OVER (ORDER BY ym.date), 2) AS avg
FROM (SELECT generate_series(date_trunc('month', min(date)),
                             date_trunc('month', max(date)),
                             interval '1 month') AS date
      FROM fb_parsed
      WHERE business_id = 1   -- calendar limited to the business being reported
     ) ym
LEFT JOIN fb_parsed p
       ON EXTRACT(YEAR FROM ym.date) = EXTRACT(YEAR FROM p.date)
      AND EXTRACT(MONTH FROM ym.date) = EXTRACT(MONTH FROM p.date)
      AND p.business_id = 1   -- kept in ON so unmatched months are not dropped
GROUP BY ym.date              -- ym has exactly one row per month
ORDER BY ym.date;
I think this will do what you want.

Final query:
-- Month calendar for business 1 on site 'facebook', left-joined to the
-- matching rows; the windowed AVG accumulates the monthly averages in
-- (year, month) order.
SELECT
    EXTRACT(MONTH FROM ym.date) AS month,
    EXTRACT(YEAR FROM ym.date) AS year,
    ROUND(
        AVG(AVG(q_overall)) OVER (
            ORDER BY EXTRACT(YEAR FROM ym.date), EXTRACT(MONTH FROM ym.date)
        ),
        2
    )
FROM (
    -- one row per month between the earliest and latest matching date
    SELECT generate_series(
               date_trunc('month', min(date)),
               date_trunc('month', max(date)),
               interval '1 month'
           ) AS date
    FROM fb_parsed
    WHERE business_id = 1 AND site = 'facebook'
) ym
LEFT JOIN fb_parsed p
    ON  EXTRACT(YEAR FROM ym.date) = EXTRACT(YEAR FROM p.date)
    AND EXTRACT(MONTH FROM ym.date) = EXTRACT(MONTH FROM p.date)
    AND p.business_id = 1
    AND site = 'facebook'
GROUP BY year, month;

Can I do this with SELF JOINS and generate_series?
Yep, you're close, but your current query does a Cumulative Average. The tricky part is to fill the gaps with the previous value (if PostgreSQL supported the IGNORE NULLS option of LAST_VALUE this would be easier...)
-- Fill months without data with the value of the previous month that has one.
-- grp increases by 1 on every month that has a value, so a month with data and
-- the empty months following it share one grp; MAX() then spreads that single
-- non-NULL value over the whole group.
SELECT month,
       year,
       MAX(q_overall) OVER (PARTITION BY grp) AS q_overall
FROM (
    SELECT EXTRACT(MONTH FROM all_months.date) AS month,
           EXTRACT(YEAR FROM all_months.date) AS year,
           p.q_overall,
           -- running count of months that have a value, in calendar order
           SUM(CASE WHEN p.q_overall IS NULL THEN 0 ELSE 1 END)
               OVER (ORDER BY all_months.date
                     ROWS UNBOUNDED PRECEDING) AS grp
    FROM (
        -- every month between the first and last date of business 1
        SELECT generate_series(date_trunc('month', min(date)),
                               date_trunc('month', max(date)),
                               interval '1 month') AS date
        FROM fb_parsed
        WHERE business_id = 1
    ) AS all_months
    LEFT JOIN (
        -- average per month
        SELECT EXTRACT(MONTH FROM date) AS month,
               EXTRACT(YEAR FROM date) AS year,
               ROUND(AVG(q_overall), 1) AS q_overall
        FROM fb_parsed
        WHERE business_id = 1
        GROUP BY year, month
    ) AS p
        ON  EXTRACT(YEAR FROM all_months.date) = p.year
        AND EXTRACT(MONTH FROM all_months.date) = p.month
) AS dt
ORDER BY year, month;
Edit:
Oops, this was overly complicated, the question asked for a Cumulative Average and then NULLs will not change the result and there's no need to fill the gaps

Related

Monthly total count of overall users in sql

I need the monthly new user count and the total count of users until that month end in Oracle SQL. Can calculate monthly new users, but struggling to count total for that month.
-- New registrations per month, restricted to 2020.
SELECT
    COUNT(*) AS registered_users,
    EXTRACT(MONTH FROM add_date) AS mn,
    EXTRACT(YEAR FROM add_date) AS yr
FROM users
WHERE EXTRACT(YEAR FROM add_date) = 2020
GROUP BY
    EXTRACT(MONTH FROM add_date),
    EXTRACT(YEAR FROM add_date)
ORDER BY
    EXTRACT(MONTH FROM add_date);
You want a running total. You get this with SUM OVER.
-- The running total is computed over all months first; the 2020 filter is
-- applied afterwards, so registrations from earlier years remain part of
-- total_users.
WITH monthly AS (
    SELECT
        EXTRACT(YEAR FROM add_date) AS yr,
        EXTRACT(MONTH FROM add_date) AS mn,
        COUNT(*) AS new_users,
        SUM(COUNT(*)) OVER (
            ORDER BY EXTRACT(YEAR FROM add_date), EXTRACT(MONTH FROM add_date)
        ) AS total_users
    FROM users
    GROUP BY EXTRACT(YEAR FROM add_date), EXTRACT(MONTH FROM add_date)
)
SELECT yr, mn, new_users, total_users
FROM monthly
WHERE yr = 2020
ORDER BY yr, mn;
You can use the SUM analytical function with COUNT aggregate function and running the analytical function on add_date by month as follows:
-- Same running total, bucketing by the first day of the month (TRUNC 'MON')
-- and splitting that date into year and month only in the outer select.
WITH by_month AS (
    SELECT
        TRUNC(add_date, 'MON') AS dt,
        COUNT(*) AS new_users,
        SUM(COUNT(*)) OVER (ORDER BY TRUNC(add_date, 'MON')) AS total_users
    FROM users
    GROUP BY TRUNC(add_date, 'MON')
)
SELECT
    EXTRACT(YEAR FROM dt) AS yr,
    EXTRACT(MONTH FROM dt) AS mn,
    new_users,
    total_users
FROM by_month
WHERE EXTRACT(YEAR FROM dt) = 2020
ORDER BY yr, mn;

PostgreSQL: Simplifying a SQL query into a shorter query

I have a table called 'daily_prices' where I have 'sale_date', 'last_sale_price', 'symbol' as columns.
I need to calculate how many times 'last_sale_price' has gone up compared to previous day's 'last_sale_price' in 10 weeks.
Currently I have my query like this for 2 weeks:
-- One SELECT per week: count the days whose price exceeds the previous day's
-- price within that week and sum the increases; UNION stacks the weeks.
SELECT
    COUNT(*) AS "timesUp",
    SUM(last_sale_price - prev_price) AS "dollarsUp",
    'wk1' AS "week"
FROM (
    SELECT
        last_sale_price,
        LAG(last_sale_price, 1) OVER (ORDER BY sale_date) AS prev_price
    FROM daily_prices
    WHERE sale_date BETWEEN CAST('2020-09-14' AS DATE) AND CAST('2020-09-18' AS DATE)
      AND symbol = 'AAPL'
) priced
WHERE last_sale_price > prev_price
UNION
SELECT
    COUNT(*) AS "timesUp",
    SUM(last_sale_price - prev_price) AS "dollarsUp",
    'wk2' AS "week"
FROM (
    SELECT
        last_sale_price,
        LAG(last_sale_price, 1) OVER (ORDER BY sale_date) AS prev_price
    FROM daily_prices
    WHERE sale_date BETWEEN CAST('2020-09-07' AS DATE) AND CAST('2020-09-11' AS DATE)
      AND symbol = 'AAPL'
) priced
WHERE last_sale_price > prev_price
I'm using 'UNION' to combine the weekly data. But as the number of weeks increase the query is going to be huge.
Is there a simpler way to write this query?
Any help is much appreciated. Thanks in advance.
you can extract week from sale_date. then apply group by on the upper query
-- Price increases per ISO week for AAPL.
-- isoyear is paired with week because EXTRACT(week ...) is the ISO week
-- number; combining it with the calendar year mislabels the days around
-- New Year that belong to week 52/53 or week 1 of the neighbouring year.
select extract(isoyear from sale_date) as year,
       extract(week from sale_date) as week,
       count(*) as "timesUp",
       sum(last_sale_price - prev_price) as "dollarsUp"
from (
    -- previous day's price is taken across week boundaries
    select sale_date,
           last_sale_price,
           lag(last_sale_price, 1) over (order by sale_date) as prev_price
    from daily_prices
    where symbol = 'AAPL'
) nest  -- PostgreSQL before version 16 requires an alias on a derived table
where last_sale_price > prev_price
group by extract(isoyear from sale_date), extract(week from sale_date)
order by year, week
to extract only weekdays you can add this filter
EXTRACT(dow FROM sale_date) in (1,2,3,4,5)
PS: make sure that monday is first day of the week. In some countries sunday is the first day of the week
You can filter on the last 8 weeks in the where clause, then group by week and do conditional aggregation:
-- Price increases per ISO week over the last 8 full weeks plus the current one.
-- A day counts as "up" when its price is higher than the previous day's price
-- within the same week. Weeks without any increase still produce a row.
select extract(isoyear from sale_date) as yyyy,
       extract(week from sale_date) as ww,
       -- SUM over zero filtered rows is NULL, so default it to 0
       coalesce(sum(last_sale_price - lag_last_sale_price)
                    filter (where last_sale_price > lag_last_sale_price), 0) as sum_dollars_up,
       count(*) filter (where last_sale_price > lag_last_sale_price) as cnt_dollars_up
from (
    select p.sale_date,
           p.last_sale_price,
           -- partitioned by week: the first day of a week has no previous price
           lag(p.last_sale_price) over (partition by date_trunc('week', p.sale_date)
                                        order by p.sale_date) as lag_last_sale_price
    from daily_prices p
    where p.symbol = 'AAPL'
      and p.sale_date >= date_trunc('week', current_date) - interval '8 week'
) dp
group by extract(isoyear from sale_date), extract(week from sale_date)
order by yyyy, ww
Notes:
I am assuming that you don't want to compare the first price of a week to the last price of the previous week; if you do, then just remove the partition by clause from the over() clause of lag()
this dynamically computes the date as of 8 (entire) weeks ago
if there is no price increase during a whole week, the query still gives you a row, with 0 as sum_dollars_up and cnt_dollars_up

Only sum values for groups with more than one row

I have this query:
-- N is the number of rows sharing the same year+month key; only rows whose
-- key occurs more than once are kept.
SELECT
    EXTRACT(YEAR FROM date1),
    EXTRACT(MONTH FROM date1),
    spending
FROM (
    SELECT
        *,
        COUNT(*) OVER (
            PARTITION BY CONCAT(EXTRACT(YEAR FROM date1), EXTRACT(MONTH FROM date1))
        ) N
    FROM table
) AS A
WHERE N > 1
GROUP BY date1
ORDER BY date1 ASC;
With this result:
I need to sum the field spending only when there is more than one row with the same year and month. Desired result:
year month spending
---- ----- --------
2015 1 5424
2016 1 605886
2016 5 xxxxxx
.... .. ......
ok, I found the solution: HAVING :
-- Total spending per (year, month), only for months that have more than one row.
-- `table` is the placeholder table name used throughout this thread.
SELECT extract(year from date1) AS year,
       extract(month from date1) AS month,
       sum(spending) AS spending
FROM table
GROUP BY extract(year from date1), extract(month from date1)
HAVING count(*) > 1
ORDER BY extract(year from date1), extract(month from date1) ASC;
in case it helps someone.
This can be simpler and faster using date_trunc() and some simplifications:
-- One row per calendar month (first day of the month as a date), limited to
-- months that contain more than one source row.
SELECT
    date_trunc('month', date1)::date AS month,
    sum(spending) AS sum_spending,
    count(*) AS count_rows  -- optional addition
FROM table
GROUP BY 1
HAVING count(*) > 1
ORDER BY 1;
Only returns the sum of spendings for months with more than one row.
If you need to display separate year and month numbers, you could use above query in a subquery, still faster:
-- Aggregate per truncated month in the subquery, then split that month date
-- into separate integer year and month columns.
SELECT
    extract(year FROM month)::int AS year,
    extract(month FROM month)::int AS month,
    sum_spending,
    count_rows
FROM (
    SELECT
        date_trunc('month', date1)::date AS month,
        sum(spending) AS sum_spending,
        count(*) AS count_rows  -- optional
    FROM table
    GROUP BY 1
    HAVING count(*) > 1
    ORDER BY 1
) sub;
Or extract the numbers directly like in your solution, but just use the much faster count(*) in the HAVING clause:
-- Group directly on the extracted year and month (positional references 1, 2)
-- and keep only groups with more than one row.
SELECT
    extract(year FROM date1)::int AS year,
    extract(month FROM date1)::int AS month,
    sum(spending) AS sum_spending,
    count(*) AS count_rows  -- optional
FROM table
GROUP BY 1, 2
HAVING count(*) > 1
ORDER BY 1, 2;
1, 2 are (totally optional) positional references to shorten the syntax, so we don't have to repeat the expressions in the SELECT list. Example:
Select first row in each GROUP BY group?
The cast to integer (::int) is also optional. The generic return type of extract is double precision, but year and month can be cast to integer safely. Smaller, faster and more adequate.
try this
-- N counts the rows per year+month key; rows from months with a single row
-- are filtered out before spending is summed per (year, month).
SELECT
    EXTRACT(YEAR FROM date1),
    EXTRACT(MONTH FROM date1),
    SUM(spending)
FROM (
    SELECT
        *,
        COUNT(*) OVER (
            PARTITION BY CONCAT(EXTRACT(YEAR FROM date1), EXTRACT(MONTH FROM date1))
        ) N
    FROM table
) AS A
WHERE N > 1
GROUP BY
    EXTRACT(YEAR FROM date1),
    EXTRACT(MONTH FROM date1)
ORDER BY
    EXTRACT(YEAR FROM date1),
    EXTRACT(MONTH FROM date1) ASC;

Postgres - Cohort analysis across months sequentially, not if exists in any later month

I'm doing a cohort analysis and can get the group of users to examine, then see whether they transacted in the months following on. But I want it like this:
Of that group in December, who transacted in Jan; of the Jan group from Dec, who transacted in Feb. Basically i'm tracking decay of the customer base
What I don't want is those that return in any month following Dec, which is this:
-- Users who transacted in November 2016, counted in every later month in
-- which they transacted at all (returns in ANY later month are included).
WITH start_sample AS (
    -- distinct (user, timestamp) pairs inside the sample month
    SELECT
        user_fk,
        created_at AS start_sample_date
    FROM transactions
    WHERE created_at >= '2016-11-01' AND created_at < '2016-12-01'
    GROUP BY user_fk,
             start_sample_date
),
start_sample_min AS (
    -- one row per sampled user with the first transaction in the sample month
    SELECT
        user_fk,
        MIN(start_sample_date) AS first_transaction
    FROM start_sample
    GROUP BY user_fk
)
SELECT
    DATE_TRUNC('month', t.created_at) AS transacting_month,
    COUNT(DISTINCT t.user_fk)
FROM transactions t  -- alias t is required by the t.user_fk reference below
WHERE t.created_at >= '2016-11-01'
  AND t.user_fk IN (SELECT user_fk FROM start_sample_min)
GROUP BY transacting_month
ORDER BY transacting_month;
Then I made a churn model to see if it would get what I need, but it doesn't:
-- Churn model: labels each user-month as ACTIVATION / ACTIVE / RETURN and
-- adds a CHURN row in the following month for users with no activity there.
-- monthly_users: one row per (user, month) with at least one transaction.
WITH monthly_users AS (
SELECT
user_fk AS monthly_user_fk,
DATE_TRUNC('month', created_at) AS month
FROM transactions
WHERE created_at >= '2016-11-01' AND created_at < '2017-12-01'
GROUP BY monthly_user_fk, month
ORDER BY monthly_user_fk, month
),
-- lag_lead: for each user-month, the user's previous and next active month
-- (NULL when there is none).
lag_lead AS (
SELECT
monthly_user_fk,
month,
LAG(month,1) OVER (PARTITION BY monthly_user_fk ORDER BY month) AS lag,
LEAD(month,1) OVER (PARTITION BY monthly_user_fk ORDER BY month) AS lead
FROM monthly_users),
-- lag_lead_with_diffs: gap to the previous / next active month; the interval
-- is divided by 86400 before taking its epoch seconds, which yields days.
lag_lead_with_diffs AS (
SELECT
monthly_user_fk,
month,
lag AS previous_month,
lead AS next_month,
EXTRACT(EPOCH FROM (month - lag)/86400)::INT AS lag_size,
EXTRACT(EPOCH FROM (lead - month)/86400)::INT AS lead_size
FROM lag_lead
),
-- calculated: distinct users per month and status. A gap of at most 31 days
-- is treated as consecutive months; positions 2 and 3 in the GROUP BY are the
-- two CASE expressions.
calculated AS (
SELECT
month,
CASE WHEN previous_month IS NULL THEN 'ACTIVATION'
WHEN lag_size <= 31 THEN 'ACTIVE'
WHEN lag_size > 31 THEN 'RETURN' END AS this_month_values,
CASE WHEN (lead_size > 31 OR lead_size IS NULL) THEN 'CHURN' ELSE NULL END AS next_month_churn,
COUNT(DISTINCT monthly_user_fk) AS c_d_users
FROM lag_lead_with_diffs
GROUP BY month, 2, 3
)
-- First branch: users per month and status.
SELECT
month,
this_month_values,
SUM(c_d_users) AS distinct_users
FROM calculated
GROUP BY month, this_month_values
UNION
-- Second branch: users flagged as churning are reported in the month after
-- their last active month; position 2 in the GROUP BY is the 'CHURN' literal.
SELECT month + INTERVAL '1 month',
'CHURN',
SUM(c_d_users)
FROM calculated
WHERE next_month_churn IS NOT NULL
GROUP BY month + INTERVAL '1 month', 2
-- 1512086400 is 2017-12-01 00:00:00 UTC as Unix epoch seconds, so churn rows
-- that would land on or after that instant are dropped.
HAVING (EXTRACT(EPOCH FROM (month + INTERVAL '1 month'))) < 1512086400
ORDER BY month, this_month_values;
However this is not fixed at the initial group. The Active group rolls from month to month.
I understand that the above is likely more complicated than what i'm asking, but I can't seem to get my head around it
Thanks in advance
Perhaps this is what you are looking for:
-- Cohort decay: per month, count the users who have transacted in EVERY month
-- since the start month (November 2016) without a gap.
with monthly_users as (
    -- one row per (user, month); months_between is the month's offset from
    -- November 2016 (0 for 2016-11, 1 for 2016-12, ...)
    select user_fk,
           date_trunc('month', created_at) as month,
           (date_part('year', created_at) - 2016) * 12
               + date_part('month', created_at) - 11 as months_between
    from transactions
    -- half-open range: a closed BETWEEN would also admit 2017-12-01 00:00:00
    where created_at >= date '2016-11-01'
      and created_at <  date '2017-12-01'
    group by user_fk, month, months_between
),
t2 as (
    -- prev_rec_cnt: number of earlier active months of the same user
    select user_fk,
           month,
           months_between,
           count(*) over (partition by user_fk
                          order by month
                          rows between unbounded preceding and 1 preceding) as prev_rec_cnt
    from monthly_users
)
select month,
       count(*)
from t2
-- equal only when the user was active in every month before this one
where months_between = prev_rec_cnt
group by month
order by month;
In this query the Monthly_Users CTE is just like yours, but adds a computation of the number of Months_Between the created_at date and your initial starting date. In the second Common Table Expression, I count the number of occurrences of each user_fk prior to the current months record. Finally in the output query I limit the results to only those records where the Months_Between value matches the Prev_Rec_Cnt value. Any missed months will cause the Prev_Rec_Cnt value to not match the Months_Between value, so you'll be able to see the fall off of user_fk values from month to month.

SQL Reactivation Revenue

I'm looking for a query that will Sum Reactivation Revenue from a given date on. Currently I have the following query;
-- Monthly cost per advertisable (cost scaled by 1e6 and rounded), limited to
-- advertisables whose total scaled cost exceeds 100.
SELECT
    advertisable,
    EXTRACT(YEAR FROM day),
    EXTRACT(MONTH FROM day),
    ROUND(SUM(cost) / 1e6)
FROM adcube dac
WHERE advertisable_eid IN (
    SELECT advertisable
    FROM adcube dac
    GROUP BY advertisable
    HAVING SUM(cost) / 1e6 > 100
)
GROUP BY
    advertisable,
    EXTRACT(YEAR FROM day),
    EXTRACT(MONTH FROM day)
ORDER BY
    advertisable,
    EXTRACT(YEAR FROM day),
    EXTRACT(MONTH FROM day)
From this I then export to Excel and check for accounts that have stopped spending for 4 months and then reactivated. I then track the new revenue from the new reactivation month.
Is it possible to get an SQL query to do this without need of Excel?
Thanks
Assuming the four months is actually present in the data, you can do this using window functions. You can find N things in a row by taking the difference between two row_numbers(). Here is the idea:
-- Find, per advertisable, runs of at least 4 consecutive months with zero
-- (rounded) spend, and report the first month of each run as yyyymm.
-- Requires the zero-spend months to be present as rows in the data.
with t as (
    -- monthly spend per advertisable
    select advertisable,
           extract(year from day) as yy,
           extract(month from day) as mon,
           round(sum(cost) / 1e6) as val
    from adcube dac
    where advertisable_eid in (select advertisable
                               from adcube dac
                               group by advertisable
                               having sum(cost) / 1e6 > 100
                              )
    group by advertisable, extract(year from day), extract(month from day)
)
select advertisable,
       min(yy * 100 + mon) as yyyymm  -- e.g. 2016 * 100 + 5 = 201605
from (
    -- the difference of the two row numbers is constant within a run of
    -- consecutive months that share the same val
    select t.advertisable,
           t.yy,
           t.mon,
           t.val,
           row_number() over (partition by advertisable order by yy, mon)
               - row_number() over (partition by advertisable, val order by yy, mon) as grp
    from t
) runs
where val = 0   -- row filter applied before grouping
group by advertisable, grp
having count(*) >= 4;