CASE AND WHEN SQL - sql

I have transactional data of customers' purchases. I tried to select each customer_id from the last month and calculate recency as the average number of days between a customer's purchases (AVG(gap)):
SELECT
customer_id,
(
CASE WHEN day::DATE<= '2015-05-01'::DATE AND day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
THEN
(
SELECT
AVG(gap)
FROM
(
SELECT
customer_id,
( day- LAG(day) OVER ( PARTITION BY customer_id ORDER BY day ) ) AS gap
FROM
baskets
JOIN
basket_lines
USING
( basket_id )
GROUP BY 1
) a
) b
ELSE 0
) AS A
FROM
baskets
JOIN
basket_lines
USING
(basket_id)
GROUP BY
1;
However, I get an error like this:
ERROR: syntax error at or near "b"
LINE 45: GROUP BY 1)a)b ELSE 0) AS A
^
Does it mean I can not use subquery after THEN statement?

A subquery in the THEN clause does not take an alias. Also, you must end your CASE expression with END:
-- Syntax-corrected form of the question's query: the scalar subquery after
-- THEN carries no alias of its own, and the CASE expression is closed with END.
-- NOTE(review): the outer query groups by customer_id only but reads `day`
-- outside any aggregate, and the inner query does the same around LAG();
-- PostgreSQL will likely reject this -- confirm before reusing it.
SELECT
customer_id,
(CASE WHEN day::DATE<= '2015-05-01'::DATE AND
day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
THEN
-- Scalar subquery: not correlated with the outer row, so AVG(gap) is taken
-- over every customer's gaps.
(SELECT AVG(gap) FROM (
SELECT customer_id,
(day- LAG(day) OVER (PARTITION BY customer_id ORDER BY day)) as gap
FROM baskets
JOIN basket_lines
USING (basket_id)
GROUP BY 1) a) ELSE 0 END) AS A
FROM baskets
JOIN basket_lines
USING (basket_id)
GROUP BY 1;
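But you have a scalar subquery in your SELECT list, and it is not correlated with the outer row, so it returns the same overall average for every customer. This is probably not optimal, and we can likely rewrite your query using a CTE with a plain join and filter.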
I propose the following refactor:
-- Average gap between consecutive purchase rows per customer, restricted to
-- rows dated within the month ending 2015-05-01 (inclusive).
WITH purchase_gaps AS (
    -- Gap between each row and the same customer's previous row;
    -- NULL for a customer's first row, which AVG() ignores.
    -- NOTE(review): the join to basket_lines may yield several rows per
    -- basket (zero-length gaps) -- confirm that is intended.
    SELECT
        customer_id,
        day - LAG(day) OVER (PARTITION BY customer_id ORDER BY day) AS gap
    FROM baskets
    INNER JOIN basket_lines
        USING (basket_id)
    WHERE day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
      AND day::DATE <= '2015-05-01'::DATE
)
SELECT
    customer_id,
    AVG(gap) AS cust_avg
FROM purchase_gaps
GROUP BY customer_id;

Related

SQL Get last 7 days from event date

The best way to explain what I need is showing, so, here it is:
Currently I have this query
select
date_
,count(*) as count_
from table
group by date_
which returns the following result set:
Now I need a new column that shows the count of all rows from the previous 7 days, relative to the row's date_.
So, if the row is for 29/06, I have to count all occurrences on that day (my query already does this) and also count all occurrences from 22/06 to 29/06.
The result should be something like this:
If you have values for all dates, without gaps, then you can use window functions with a rows frame:
-- Daily row count plus a rolling total over the current date and the 7 date
-- rows before it. The frame counts rows, not days, so it only matches a
-- calendar window when every date is present in mytable.
select
    date,
    count(*) as cnt,  -- comma was missing here, which made the query a syntax error
    sum(count(*)) over (order by date rows between 7 preceding and current row) as cnt_d7
from mytable
group by date
order by date
you can try something like this:
-- Daily row count plus, per date, the number of rows whose date_ falls in the
-- 7-day window ending on that date (lower bound exclusive, upper inclusive).
select
    a.date_,
    count(*) as count_,
    (
        select count(*)
        from table as b
        where b.date_ <= a.date_
          -- was a.date, a column that does not exist; the column is date_
          and b.date_ > a.date_ - interval '7 days'
    ) as count7days_
from table as a
group by a.date_
If you have gaps, you can do a more complicated solution where you add and subtract the values:
-- "Ins and outs" rolling count: every day's count enters on its own date and
-- leaves again (as a negative count) 8 days later, so a running total over
-- the combined rows only ever contains the recent days.
with t as (
    select date_, count(*) as count_
    from table
    group by date_
    union all
    select date_ + interval '8 day', -count(*) as count_
    from table
    group by date_
)
select
    date_,
    sum(sum(count_)) over (order by date_ rows between unbounded preceding and current row)
        - sum(count_) as prev_days_count
from t
-- Required: date_ is selected next to sum(count_), and the window function
-- runs over the per-date aggregates.
group by date_;
The - sum(count_) is because you do not seem to want the current day in the cumulated amount.
You can also use the nasty self-join approach . . . which should be okay for 7 days:
-- Per-date count joined to the counts of the 7 days strictly before it;
-- dates with no earlier rows in range keep a NULL sum via the left join.
with daily as (
    select date_, count(*) as count_
    from table
    group by date_
)
select cur.date_, cur.count_, sum(prev.count_)
from daily as cur
left join daily as prev
    on prev.date_ < cur.date_
   and prev.date_ >= cur.date_ - interval '7 day'
group by cur.date_, cur.count_;
The performance will get worse and worse as "7" gets bigger.
Try with subquery for the new column:
-- Per-date count plus the number of rows dated within the 7 days up to and
-- including that date. The outer and inner scans of the table get distinct
-- aliases: the output alias groupdate is not visible inside the subquery's
-- WHERE clause, and an unaliased inner `table` would shadow the outer one.
select
    cur.date_ as groupdate,
    count(cur.date_) as date_count,
    (
        select count(win.date_)
        from table as win
        where win.date_ <= cur.date_
          and win.date_ >= cur.date_ - interval '7 day'
    ) as total7
from table as cur
group by cur.date_
order by cur.date_

Window functions ordered by date when some dates don't exist

Suppose this example query:
select
id
, date
, sum(var) over (partition by id order by date rows 30 preceding) as roll_sum
from tab
When some dates are missing from the date column, the window does not take the nonexistent dates into account. How can I make this window aggregation include these missing dates?
Many thanks!
You can join a sequence containing all dates from a desired interval.
-- Left-joins the result of the original query onto a generated list of dates,
-- so dates with no matching row still appear (with NULL id / roll_sum), then
-- trims the calendar to strictly between tab2's min and max my_date.
-- `your_query` is a placeholder for the query from the question.
select
*
from (
select
d.date,
q.id,
q.roll_sum
-- NOTE(review): Presto normally needs a column alias on UNNEST, e.g.
-- "AS d (date)", and limits the length of sequence() results -- confirm
-- this runs as written on the target version.
from unnest(sequence(date '2000-01-01', date '2030-12-31')) d
left join ( your_query ) q on q.date = d.date
) v
where v.date > (select min(my_date) from tab2)
and v.date < (select max(my_date) from tab2)
In standard SQL, you would typically use a window range specification, like:
-- Rolling sum of var per id using a RANGE frame: the window is defined by
-- date distance (30 days back through the current row), not by row count.
select
    id,
    date,
    sum(var) over (partition by id order by date range interval '30' day preceding) as roll_sum
from tab
However, I am unsure whether Presto supports this syntax. You can resort to a correlated subquery instead:
-- Rolling sum of var per id: for each row, sums every row of the same id
-- dated from 30 days earlier up to and including the row's own date.
select
    cur.id,
    cur.date,
    (
        select sum(win.var)
        from tab as win
        where win.id = cur.id
          and win.date <= cur.date
          and win.date >= cur.date - interval '30' day
    ) as roll_sum
from tab as cur
I don't think Presto supports window functions with interval ranges. Alas. There is an old-fashioned way of doing this, by counting "ins" and "outs" of values:
-- "Ins and outs" rolling sum: each value enters the running total on its own
-- date and is cancelled by a negated copy dated 30 days later. Rows that
-- exist only because of a cancelling copy are filtered out at the end.
with t as (
    -- Original rows, flagged so they can be told apart from cancelling rows.
    select id, date, var, 1 as is_orig
    from tab  -- was "from t": the CTE must read the base table, not itself
    union all
    -- Cancelling rows: same value negated, 30 days later.
    select id, date + interval '30' day, -var, 0
    from tab
)
select
    s.id,
    s.date,
    s.running_30,
    s.is_orig
from (
    select
        id,
        date,
        -- The query is grouped, so the window must run over the per-group
        -- aggregate: sum(sum(var)), not sum(var).
        sum(sum(var)) over (partition by id order by date) as running_30,
        sum(is_orig) as is_orig  -- was sum(is_org), an undefined column
    from t
    group by id, date
) s  -- was aliased "id", which collides with the id column
where s.is_orig > 0

How to join 2 subqueries

I'm trying to join 2 subqueries within the same table with this query:
SELECT COUNT(phone) as users, DATE_TRUNC('month', somedate) as date_month from
(SELECT phone, MIN (created_at) as somedate
FROM analytics.orders
where status = 'done'
GROUP BY phone) as s1
GROUP BY date_month
INNER JOIN
(SELECT value, cohort FROM
(SELECT SUM (amount) as value, DATE_TRUNC('month', created_at) as cohort
FROM analytics.orders
where status = 'done'
GROUP BY cohort, (SELECT SUM (amount) from analytics.orders )
ORDER BY cohort) as s2) as s3
ON s1.date_month=s3.cohort
But I am getting this error:
syntax error at or near "INNER" LINE 7: INNER JOIN ^
I guess that something is wrong with inner naming but I can't understand what is exactly wrong.
Don't think you actually need to join or union them.
It's all using the same table.
Rather combine the queries.
-- One row per calendar month of completed ('done') orders: number of distinct
-- phones that ordered and the total order amount, oldest month first.
SELECT
    DATE_TRUNC('month', o.created_at) AS date_month,
    COUNT(DISTINCT o.phone) AS unique_phones,
    SUM(o.amount) AS total_amount
FROM analytics.orders AS o
WHERE o.status = 'done'
GROUP BY DATE_TRUNC('month', o.created_at)
ORDER BY date_month;

LEFT OUTER JOIN Error creating a subquery on bigquery

I'm trying to evaluate MAU, WAU and DAU from an event table in BigQuery.
I created a query that finds DAU and uses it to find WAU and MAU,
but it does not work; I receive this error:
LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join.
It's my query
WITH dau AS (
SELECT
date,
COUNT(DISTINCT(events.device_id)) as DAU_explorer
FROM `workspace.event_table` as events
GROUP BY 1
)
SELECT
date,
dau,
(SELECT
COUNT(DISTINCT(device_id))
FROM `workspace.event_table` as events
WHERE events.date BETWEEN DATE_ADD(dau.date, INTERVAL -30 DAY) AND dau.date
) AS mau,
(SELECT
COUNT(DISTINCT(device_id)) as DAU_explorer
FROM `workspace.event_table` as events
WHERE events.date BETWEEN DATE_ADD(dau.date, INTERVAL -7 DAY) AND dau.date
) AS wau
FROM dau
Where is my error? Is it not possible to run subqueries like this in BigQuery?
Try this instead:
-- Demo on a public dataset: 2017 question rows, with the creation date as the
-- event date and owner_user_id standing in for device_id.
WITH data AS (
SELECT DATE(creation_date) date, owner_user_id device_id
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE EXTRACT(YEAR FROM creation_date)=2017
)
#standardSQL
-- Each source row is repeated for offsets i = 1..30 and attributed to the
-- date i days before it; distinct ids are then counted per attributed date.
SELECT DATE_SUB(date, INTERVAL i DAY) date_grp
-- i never exceeds 30 here, so this condition keeps every offset.
, COUNT(DISTINCT IF(i<31,device_id,null)) unique_30_day_users
-- Only offsets 1..7 contribute to the 7-day figure.
, COUNT(DISTINCT IF(i<8,device_id,null)) unique_7_day_users
FROM `data`, UNNEST(GENERATE_ARRAY(1, 30)) i
GROUP BY 1
ORDER BY date_grp
LIMIT 100
-- Skips the first 30 date groups of the ordered result.
OFFSET 30
And if you are looking for a more efficient solution, try approximate results.

Get another column from sum-sub-select

I'm selecting something from a sub-select, which in turn gives me a list of sums. Now I want to select the base_unit column, which contains the unit of measurement. I can't seem to add base_unit to the sub-select because then it doesn't work with the GROUP BY statement.
SELECT to_char(a.pow * f_unit_converter(base_unit, '[W]'), '000.00')
FROM (
SELECT sum (time_value) AS pow
FROM v_value_quarter_hour
WHERE
mp_id IN (SELECT mp_id FROM t_mp WHERE mp_name = 'AC') AND
(now() - time_stamp < '5 day')
GROUP BY time_stamp
ORDER BY time_stamp DESC
) a
LIMIT 1
Where/how can I additionally select the base_unit from the t_mp Table for each of those sums, so that I can pass it to the f_unit_converter function?
Thanks a lot,
MrB
-- Latest quarter-hour sum for measuring point 'AC' within the last 5 days,
-- converted to watts via the point's base_unit and formatted as text.
SELECT to_char(a.pow * f_unit_converter(a.base_unit, '[W]'), '000.00')
FROM (
    -- One row per time_stamp (and unit); base_unit is carried through the
    -- join so the outer query can hand it to the converter.
    SELECT sum(q.time_value) AS pow, m.base_unit
    FROM v_value_quarter_hour AS q
    INNER JOIN t_mp AS m
        ON q.mp_id = m.mp_id
    WHERE m.mp_name = 'AC'
      AND now() - time_stamp < '5 day'
    GROUP BY time_stamp, m.base_unit
    ORDER BY time_stamp DESC
) a
LIMIT 1
Assuming that all your selected rows have the same base_unit, you should be able to add it both to the SELECT and the GROUP BY of your sub-query.
Use an INNER JOIN instead of an IN. Something like this
-- Latest quarter-hour sum for measuring point 'AC' within the last 5 days,
-- converted to watts via the point's base_unit and formatted as text.
SELECT to_char(a.pow * f_unit_converter(a.base_unit, '[W]'), '000.00')
FROM (
    -- The alias belongs on the sum: previously "AS pow" was attached to
    -- base_unit, so a.pow held the unit and base_unit was not exposed.
    SELECT sum(time_value) AS pow, base_unit
    FROM v_value_quarter_hour
    INNER JOIN t_mp
        ON v_value_quarter_hour.mp_id = t_mp.mp_id
    WHERE mp_name = 'AC'
      AND now() - time_stamp < '5 day'
    GROUP BY time_stamp, base_unit
    ORDER BY time_stamp DESC
) a
LIMIT 1