Column name is ambiguous in bigquery - sql

I am implementing the following solution: https://stackoverflow.com/a/32663098/19903400
Here is the code which I copied from that accepted answer, and used my datasource instead:
SELECT
date,
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
dates.date as date,
periods.period as period,
EXACT_COUNT_DISTINCT(activity.user_pseudo_id) as users
FROM `rayn-deen-app.analytics_317927526.events_*` as activity
CROSS JOIN (SELECT DATE_TRUNC(EXTRACT(DATE from TIMESTAMP_MICROS(event_timestamp)), DAY) as date FROM `rayn-deen-app.analytics_317927526.events_*` GROUP BY date) as dates
CROSS JOIN (SELECT period FROM (SELECT 7 as period),
(SELECT 14 as period),(SELECT 30 as period)) as periods
WHERE dates.date >= activity.date
AND INTEGER(FLOOR(DATEDIFF(dates.date, activity.date)/periods.period)) = 0
GROUP BY 1,2
)
GROUP BY date
ORDER BY date DESC
But I am getting the following error:
Column name period is ambiguous at [13:22]
So it seems here is the code snippet which is problematic:
CROSS JOIN (SELECT period FROM (SELECT 7 as period),
(SELECT 14 as period),(SELECT 30 as period)) as periods

If the goal is to have a fixed set of records, then you can replace this:
SELECT period FROM (SELECT 7 as period),
(SELECT 14 as period),(SELECT 30 as period)
with this:
-- UNION ALL stacks the three literals into three rows of a single `period`
-- column (named by the first branch), so the outer SELECT sees exactly one
-- `period` and the reference is no longer ambiguous.
SELECT period FROM (SELECT 7 as period UNION ALL
SELECT 14 UNION ALL
SELECT 30)

Related

How to count only the working days between two dates?

I have the following table called vacations, where the employee number is displayed along with the start and end date of their vacations:
id_employe
start
end
1001
2020-12-24
2021-01-04
What I am looking for is to visualize the amount of vacation days that each employee had, but separating them by employee number, month, year and number of days; without taking into account non-business days (Saturdays, Sundays and holidays).
I have the following query, which manages to omit Saturday and Sunday from the posting:
SELECT id_employee,
EXTRACT(YEAR FROM t.Date) AS year,
EXTRACT(MONTH FROM t.Date) AS month,
SUM(WEEKDAY(`Date`) < 5) AS days
FROM (SELECT v.id_employee,
DATE_ADD(v.start, interval s.seq - 1 DAY) AS Date
FROM vacations v CROSS JOIN seq_1_to_100 s
WHERE DATE_ADD(v.start, interval s.seq - 1 DAY) <= v.end
ORDER BY v.id_employee, v.start, s.seq ) t
GROUP BY id_employee, EXTRACT(YEAR_MONTH FROM t.Date);
My question is, how could I in addition to skipping the weekends, also skip the holidays? I suppose that I should establish another table where the dates of those holidays are stored, but how could my * query * be adapted to perform the comparison?
If we consider that the employee 1001 took his vacations from 2020-12-24 to 2021-01-04 and we take Christmas and New Years as holidays, we should get the following result:
id_employee
month
year
days
1001
12
2020
5
1001
1
2021
1
After you have created a table that stores the holiday dates, then you probably can do something like this:
-- Working vacation days per employee, year and month.
-- A day is counted only when it is Monday-Friday (WEEKDAY() < 5) and has no
-- matching row in the holidays table.
SELECT t.id_employee,
       EXTRACT(YEAR FROM t.Date) AS year,
       EXTRACT(MONTH FROM t.Date) AS month,
       -- ELSE 0 keeps the total at 0 (not NULL) for a month in which every
       -- vacation day is a holiday.
       SUM(CASE WHEN h.holiday_date IS NULL AND WEEKDAY(t.Date) < 5 THEN 1 ELSE 0 END) AS days
FROM (
      -- Expand each vacation into one row per calendar day. The sequence table
      -- supplies offsets 1..100, so vacations longer than 100 days are
      -- truncated; use a larger seq_1_to_N if that can happen.
      SELECT v.id_employee,
             DATE_ADD(v.start, INTERVAL s.seq - 1 DAY) AS Date
      FROM vacations v
      CROSS JOIN seq_1_to_100 s
      WHERE DATE_ADD(v.start, INTERVAL s.seq - 1 DAY) <= v.end
     ) t
LEFT JOIN holidays h ON t.Date = h.holiday_date
-- Group by exactly the expressions that are projected, so the query is also
-- valid when ONLY_FULL_GROUP_BY is enabled.
GROUP BY t.id_employee,
         EXTRACT(YEAR FROM t.Date),
         EXTRACT(MONTH FROM t.Date);
Assuming that the holidays table structure would be something like this:
-- One row per holiday; holiday_date is the column the vacation query joins on.
CREATE TABLE holidays (
id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
-- NOT NULL: a holiday without a date can never match the join.
holiday_date DATE NOT NULL,
holiday_description VARCHAR(255),
-- UNIQUE: each date is listed at most once, so the LEFT JOIN cannot multiply
-- vacation-day rows; the key also serves as the index for the join lookup.
UNIQUE KEY holidays_holiday_date_uq (holiday_date));
Then LEFT JOIN it to your current query and change the SUM() slightly by adding CASE expression to check. If the ON t.date=h.holiday_date in the left join matches, there will be result of field h.holiday_date, otherwise it will be NULL, hence only the CASE h.holiday_date WHEN IS NULL .. will be considered.
Demo fiddle
Adding this solution compatible with both MariaDB and MySQL version that supports common table expression:
-- Expand every vacation into one row per calendar day (lvdt), then count, per
-- employee / year / month, the days that are neither a holiday nor a weekend.
WITH RECURSIVE vacation_days AS (
    -- Anchor: the first day of each vacation.
    SELECT id_employee, start, start AS lvdt, end
    FROM vacations
    UNION ALL
    -- Step: advance one day for as long as the next day is still within the vacation.
    SELECT id_employee, start, DATE_ADD(lvdt, INTERVAL 1 DAY), end
    FROM vacation_days
    WHERE DATE_ADD(lvdt, INTERVAL 1 DAY) <= end
)
SELECT
    d.id_employee,
    YEAR(d.lvdt) AS year,
    MONTH(d.lvdt) AS month,
    -- Holiday rows yield NULL (ignored by SUM); other rows contribute the
    -- boolean "is a weekday" as 1 or 0.
    SUM(CASE WHEN h.holiday_date IS NULL THEN WEEKDAY(d.lvdt) < 5 END) AS days
FROM vacation_days AS d
LEFT JOIN holidays AS h
    ON h.holiday_date = d.lvdt
GROUP BY
    d.id_employee,
    YEAR(d.lvdt),
    MONTH(d.lvdt);

Improve query to be less repetitive

Is there a way to improve this query? I see two problems here -
Repetitive code
Hard coded strings
The first CTE calculates count based on 18 months. The second CTE calculates count based on 12 months.
with month_18 as (
select proc_cd, count(*) as month_18 from
(
select distinct patient, proc_cd from
service
where proc_cd = '35'
and month_id >= (select month_id from annual)
and month_id <= '202009' --This month should be 18 months from the month above
and length(patient) > 1
) a
group by proc_cd
),
month_12 as
(
select proc_cd, count(*) as month_12 from
(
select distinct patient_id, proc_cd from
service
where proc_cd = '35'
and month_id >= '201910'
and month_id <= '202009' --This month should be 12 months from the month above
and length(patient) > 1
) a
group by proc_cd
)
select a.*, b.month_12 from
month_18 a
join month_12 b
on a.proc_cd = b.proc_cd
If I understand correctly, you can use conditional aggregation:
-- Distinct-patient counts for the 18-month and 12-month windows in a single
-- pass over service, using one filtered aggregate per window.
-- month_id is compared as 'YYYYMM' text, as in the window literals below.
select proc_cd,
       count(distinct patient) filter (where month_id >= (select month_id from annual) and month_id <= '202009') as month_18,
       count(distinct patient) filter (where month_id >= '201910' and month_id <= '202009') as month_12
from service
-- proc_cd is matched against the string '35'; an integer literal here would
-- not compare against a text column without a cast.
where proc_cd = '35' and
      length(patient) > 1
group by proc_cd;
If you have to deal with date arithmetic on the month ids, you can convert to a date, do the arithmetic and convert back to a string:
-- Shift a 'YYYYMM' month id back by 12 months: parse it to a date, subtract
-- the interval, then format the result back to 'YYYYMM' text.
-- The VALUES row supplies a single sample input, '202009'.
select to_char(to_date(month_id, 'YYYYMM') - interval '12 month', 'YYYYMM')
from (values ('202009')) v(month_id);

sql user retention calculation

I have a table records like this in Athena, one user one row in a month:
month, id
2020-05 1
2020-05 2
2020-05 5
2020-06 1
2020-06 5
2020-06 6
Need to calculate the percentage=( users come both prior month and current month )/(prior month total users).
Like in the above example, users come both in May and June 1,5 , May total user 3, this should calculate a percentage of 2/3*100
with monthly_mau AS
(SELECT month as mauMonth,
date_format(date_add('month',1,cast(concat(month,'-01') AS date)), '%Y-%m') AS nextMonth,
count(distinct userid) AS monthly_mau
FROM records
GROUP BY month
ORDER BY month),
retention_mau AS
(SELECT
month,
count(distinct useridLeft) AS retention_mau
FROM (
(SELECT
userid as useridLeft,month as monthLeft,
date_format(date_add('month',1,cast(concat(month,'-01') AS date)), '%Y-%m') AS nextMonth
FROM records ) AS prior
INNER JOIN
(SELECT
month ,
userid
FROM records ) AS current
ON
prior.useridLeft = current.userid
AND prior.nextMonth = current.month )
WHERE userid is not null
GROUP BY month
ORDER BY month )
SELECT *, cast(retention_mau AS double)/cast(monthly_mau AS double)*100 AS retention_mau_percentage
FROM monthly_mau as m
INNER JOIN monthly_retention_mau AS r
ON m.nextMonth = r.month
order by r.month
This gives me percentage as 100 which is not right. Any idea?
Hmmm . . . assuming you have one row per user per month, you can use window functions and conditional aggregation:
-- Per month: the number of users, and how many of them were also present in
-- the immediately preceding calendar month. Assumes one row per user per month.
-- Retention for a month is both_months divided by the previous month's num_users.
select month, count(*) as num_users,
       -- Compare dates, not the 'YYYY-MM' strings: the user's previous active
       -- month must be exactly one month before this row's month.
       sum(case when prev_month_date = date_add('month', -1, month_date) then 1 else 0 end) as both_months
from (select r.*,
             cast(concat(month, '-01') as date) as month_date,
             -- The user's previous active month, or NULL on their first month.
             lag(cast(concat(month, '-01') as date)) over (partition by id order by month) as prev_month_date
      from records r
     ) r
group by month;

BigQuery Monthly Active Users?

I'm currently working off a query from this post. That query is written in Legacy SQL and will not work in my environment. I've modified the query to use the modern SQL functions and updated the SELECT date as date to use timestamp_micros.
I should also mention that the rows I'm trying to select are coming in from Firebase Analytics.
My Query:
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date,
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `com_sidearm_fanapp_uiowa_IOS.*` as activity
CROSS JOIN
UNNEST(event_dim) as event
CROSS JOIN (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date
FROM `com_sidearm_fanapp_uiowa_IOS.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT
period
FROM
(
SELECT 7 as period
UNION ALL
SELECT 14 as period
UNION ALL
SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(dates.date, activity.date, DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY date
ORDER BY date DESC
Column name period is ambiguous at [24:13] error.
to fix this particular error - you should fix below
CROSS JOIN (
SELECT
period
FROM
(SELECT 7 as period),
(SELECT 14 as period),
(SELECT 30 as period)
) as periods
so it should look like:
CROSS JOIN (
SELECT
period
FROM
(SELECT 7 as period UNION ALL
SELECT 14 as period UNION ALL
SELECT 30 as period)
) as periods
Answer on your updated question
Try below. I didn't have chance to test it but hope it can help you fix your query
-- Rolling 7/14/30-day active users: for every calendar day that has events,
-- count the distinct app instances active on that day or within the preceding
-- (period - 1) days.
SELECT
  date,
  SUM(CASE WHEN period = 7 THEN users END) as days_07,
  SUM(CASE WHEN period = 14 THEN users END) as days_14,
  SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
  SELECT
    -- Report per reporting day (dates.date). Projecting activity.date here
    -- would collapse every window to a plain same-day count.
    dates.date as date,
    periods.period as period,
    COUNT(DISTINCT activity.user) as users
  FROM (
    -- One row per (calendar day, app instance); event times are truncated to
    -- days so the windows are measured in days, not microseconds.
    SELECT DISTINCT
      DATE(TIMESTAMP_MICROS(event.timestamp_micros)) as date,
      user_dim.app_info.app_instance_id as user
    FROM `yourTable` CROSS JOIN UNNEST(event_dim) as event
  ) as activity
  CROSS JOIN (
    -- Every calendar day that has at least one event.
    SELECT DISTINCT
      DATE(TIMESTAMP_MICROS(event.timestamp_micros)) as date
    FROM `yourTable` CROSS JOIN UNNEST(event_dim) as event
  ) as dates
  CROSS JOIN (
    SELECT period
    FROM
    (SELECT 7 as period UNION ALL
    SELECT 14 as period UNION ALL
    SELECT 30 as period)
  ) as periods
  -- Keep activity that falls in the window ending on the reporting day:
  -- 0 <= days between activity and reporting day < period.
  WHERE dates.date >= activity.date
  AND DATE_DIFF(dates.date, activity.date, DAY) < periods.period
  GROUP BY dates.date, periods.period
)
GROUP BY date
ORDER BY date DESC

How to get the monthly 7-day active users?

In my database I have two fields that are used to identify a user, timestamp and instance_id. I want to be able to get the monthly 7-day active users from this data. I have tried the following query but it just returns the same timestamp and 1 for every row.
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(date)) as target,
SUM(CASE WHEN period = 7 THEN users END) as days_07
# SUM(CASE WHEN period = 14 THEN users END) as days_14,
# SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
activity.date as date,
periods.period as period,
COUNT(DISTINCT user) as users
FROM (
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
CROSS JOIN (
SELECT
event.timestamp_micros as date
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT period
FROM
(
SELECT 7 as period
# UNION ALL
# SELECT 14 as period
# UNION ALL
# SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
GROUP BY date
ORDER BY date DESC
I'm not too sure where to go from here and it's quite challenging to me because I'm not the best with SQL. Any assistance at all would be great. Thanks!
I should also mention that these queries are going to be run within BigQuery and the data is being exported to BigQuery from Firebase.
Try below
-- Rolling 7/14/30-day active users per calendar day: for each day that has
-- events, count the distinct app instances seen on that day or within the
-- preceding (period - 1) days.
SELECT
  DATE,
  SUM(CASE WHEN period = 7 THEN users END) AS days_07,
  SUM(CASE WHEN period = 14 THEN users END) AS days_14,
  SUM(CASE WHEN period = 30 THEN users END) AS days_30
FROM (
  SELECT
    -- Group by the reporting day (dates.date). Grouping by activity.date would
    -- make days_07, days_14 and days_30 all equal to the same-day count.
    dates.date AS DATE,
    periods.period AS period,
    COUNT(DISTINCT activity.user) AS users
  FROM (
    -- One row per (calendar day, app instance).
    SELECT DISTINCT
      DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE,
      user_dim.app_info.app_instance_id AS user
    FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
    CROSS JOIN UNNEST(event_dim) AS event
  ) AS activity
  CROSS JOIN (
    -- Every calendar day that has at least one event.
    SELECT DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE
    FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
    CROSS JOIN UNNEST(event_dim) AS event
    GROUP BY 1
  ) AS dates
  CROSS JOIN (
    SELECT period FROM
    (SELECT 7 AS period UNION ALL
    SELECT 14 AS period UNION ALL
    SELECT 30 AS period)
  ) AS periods
  -- Keep activity inside the window ending on the reporting day:
  -- 0 <= DATE_DIFF < period (FLOOR(diff / period) = 0 on a non-negative diff).
  WHERE dates.date >= activity.date
  AND SAFE_CAST(FLOOR(DATE_DIFF(dates.date, activity.date, DAY)/periods.period) AS INT64) = 0
  GROUP BY 1,2
)
GROUP BY DATE
ORDER BY DATE DESC