Count Distinct is less than Sum(Count Distinct)

Count Distinct is less than Sum(Count Distinct) - sql

I have two queries:
select COUNT(DISTINCT (CASE WHEN EVENT_NAME = 'event' THEN UPPER(user END)) AS SIGNUP_COUNT,
from table
WHERE date BETWEEN '2020-07-01' AND '2020-09-01'
and
with EVENTS_FILTERED_with_count as (
select *
, COUNT(DISTINCT (CASE WHEN EVENT_NAME = 'event' THEN UPPER(user END)) AS SIGNUP_COUNT
from table
group by 1)
SELECT sum(SIGNUP_COUNT) FROM EVENTS_FILTERED_with_count
WHERE date BETWEEN '2020-07-01' AND '2020-09-01'
The first query returns 2.5K rows as result, and the second one returns 3K rows.
Why would adding the group by make the result larger? I'm wondering if it has to do with NULL values.

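Because the same user can have multiple events and therefore appear in more than one group. COUNT(DISTINCT ...) only removes duplicates within each group, so such a user is counted once per group, and the sum of the per-group counts ends up larger than the overall distinct count.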
It is hard to be more descriptive without sample data.

Related

BigQuery: iterating groups within a window of 28days before a start_date column using _TABLE_SUFFIX

I got a table like this:
group_id
start_date
end_date
19335
20220613
20220714
19527
20220620
20220719
19339
20220614
20220720
19436
20220616
20220715
20095
20220711
20220809
I am trying to retrieve data from another table that is partitioned, and data should be access with _TABLE_SUFFIX BETWEEN start_date AND end_date.
Each group_id contains different user_id within the period [start_date, end_date]. What I need is to retrieve data of users of a column/metric of the last 28D prior to the start_date of each group_id.
My idea is to:
Retrieve distinct user_id per group_id within the period [start_date, end_date]
Retrieve previous 28d metric data prior to the start date of each group_id
A snippet code on how to retrieve data from a single group_id is the following:
WITH users_per_group AS (
SELECT
users_metadata.user_id,
users_metadata.group_id,
FROM
`my_table_users_*` users_metadata
WHERE
_TABLE_SUFFIX BETWEEN '20220314' --start_date
AND '20220413' --end_date
AND experiment_id = 16709
GROUP BY
1,
2
)
SELECT
_TABLE_SUFFIX AS date,
user_id,
SUM(
COALESCE(metric, 0)
) AS metric,
FROM
users_per_group
JOIN `my_metric_table*` metric USING (user_id)
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_TIMESTAMP(
'%Y%m%d',
TIMESTAMP_SUB(
PARSE_TIMESTAMP('%Y%m%d', '20220314'), --start_date
INTERVAL 28 DAY
)
) -- 28 days before it starts
AND FORMAT_TIMESTAMP(
'%Y%m%d',
TIMESTAMP_SUB(
PARSE_TIMESTAMP('%Y%m%d', '20220314'), --start_date
INTERVAL 1 DAY
)
) -- 1 day before it starts
GROUP BY
1,
2
ORDER BY
date ASC
Also, I want to avoid retrieving all data (considering all dates) from that metric, as the table is huge and it will take very long time to retrieve it.
Is there an easy way to retrieve the metric data of each user across groups and considering the previous 28 days to the start data of each group_id?

I can think of 2 approaches.
Join all the tables and then perform your query.
Create dynamic queries for each of your users.
Both approaches require search_from and search_to to be available beforehand, i.e., you need to calculate each user's search range before you do anything else.
EG:
WITH users_per_group AS (
SELECT
user_id, group_id
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 4 DAY)search_from
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 1 DAY)search_to
FROM TableName
)
Once you have this kind of table then you can use any of the mentioned approaches.
Since I don't have your data and don't know about your table names I am giving an example using a public dataset.
Approach 1
-- consider this your main table which contains user,grp,start_date,end_date
with maintable as (
select 'India' visit_from, '20161115' as start_date, '20161202' end_date
union all select 'Sweden' , '20161201', '20161202'
),
--then calculate search from-to date for every user and group
user_per_grp as(
select *, DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 4 DAY)search_from --change interval as per your need
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 1 DAY)search_to
from maintable
)
select visit_from,_TABLE_SUFFIX date,count(visitId) total_visits from
user_per_grp ug
left join `bigquery-public-data.google_analytics_sample.ga_sessions_*` as pub on pub.geoNetwork.country = ug.visit_from
where _TABLE_SUFFIX between format_date("%Y%m%d",ug.search_from) and format_date("%Y%m%d",ug.search_to)
group by 1,2
Approach 2
-- Accumulates one generated SELECT statement per row of user_per_grp.
DECLARE queries ARRAY<STRING> DEFAULT [];

-- Stand-in for your main table: one row per user/group with its
-- start_date and end_date stored as YYYYMMDD strings.
CREATE TEMP TABLE maintable AS (
    SELECT 'India' AS visit_from, '20161115' AS start_date, '20161202' AS end_date
    UNION ALL
    SELECT 'Sweden', '20161201', '20161202'
);

-- Search window for every user/group: from 4 days before start_date up to
-- the day before start_date.
CREATE TEMP TABLE user_per_grp AS (
    SELECT
        *,
        DATE_SUB(PARSE_DATE("%Y%m%d", start_date), INTERVAL 4 DAY) AS search_from,
        DATE_SUB(PARSE_DATE("%Y%m%d", start_date), INTERVAL 1 DAY) AS search_to
    FROM maintable
);

-- For each user create a separate query here.
-- %T renders every argument as a properly quoted and escaped SQL literal, so
-- the generated statement stays valid whatever characters the value contains,
-- and the _TABLE_SUFFIX bounds are plain string constants.
FOR record IN (SELECT * FROM user_per_grp)
DO
    SET queries = queries || [FORMAT(
        'select %T Visit_From,_TABLE_SUFFIX Date,count(visitId) total_visits from `bigquery-public-data.google_analytics_sample.ga_sessions_*` where _TABLE_SUFFIX between %T and %T and geoNetwork.country=%T group by 1,2',
        record.visit_from,
        FORMAT_DATE("%Y%m%d", record.search_from),
        FORMAT_DATE("%Y%m%d", record.search_to),
        record.visit_from
    )];
    -- replace your query here.
END FOR;

-- Combine all generated queries and execute them as one statement.
-- Skipped when nothing was generated: STRING_AGG over an empty array is NULL,
-- and EXECUTE IMMEDIATE fails on a NULL statement.
IF ARRAY_LENGTH(queries) > 0 THEN
    EXECUTE IMMEDIATE (
        SELECT STRING_AGG(query, ' union all ')
        FROM UNNEST(queries) AS query
    );
END IF;
Here the 2nd approach processed much less data (~750 KB) than the 1st approach (~17 MB). But that might not hold for your dataset: the date ranges of two users may overlap, which leads to reading the same table twice.

Getting two rows in output in bigquery if using case

When I run this query, I get two different rows for the same date: one contains zero and the other contains the event count. Why?
How to solve this, any help will be really appreciated!
(Select
distinct(case
when event_text = 'poll_vote' THEN device_id Else 0 END) as
pollvote,event_date from
(Select event_date,event_text,count(distinct users) as device_id from
(SELECT event.name as event_text, ( user.value.value.string_value)
AS users,
CAST(TIMESTAMP_ADD(TIMESTAMP_MICROS(event.timestamp_micros),
INTERVAL 330 MINUTE) AS date) AS event_date
FROM
`dataset.tablename`,
UNNEST(event_dim) AS event,
UNNEST(user_dim.user_properties) AS user
where
user.key="context_device_id"
GROUP BY
event_date,event_text,users)
GROUP BY
event_text,event_date))

Using ‘GROUP BY’ on event_date only should give you just one row per date, as you wanted. Here are some of the GROUP BY examples.

Calculating business days in Teradata

I need help in business days calculation.
I've two tables
1) One table ACTUAL_TABLE containing order date and contact date with timestamp datatypes.
2) The second table BUSINESS_DATES has each of the calendar dates listed and has a flag to indicate weekend days.
using these two tables, I need to ensure business days and not calendar days (which is the current logic) is calculated between these two fields.
My thought process was to first get a range of dates by comparing ORDER_DATE with TABLE_DATE field and then do a similar comparison of CONTACT_DATE to TABLE_DATE field. This would get me a range from the BUSINESS_DATES table which I can then use to calculate count of days, sum(Holiday_WKND_Flag) fields making the result look like:
Order# | Count(*) As DAYS | SUM(WEEKEND DATES)
100 | 25 | 8
However, this only works when I use a specific order number; I can't bring all order numbers into a subquery.
My Query:
SELECT SUM(Holiday_WKND_Flag), COUNT(*) FROM
(
SELECT
* FROM
BUSINESS_DATES
WHERE BUSINESS.Business BETWEEN (SELECT ORDER_DATE FROM ACTUAL_TABLE
WHERE ORDER# = '100'
)
AND
(SELECT CONTACT_DATE FROM ACTUAL_TABLE
WHERE ORDER# = '100'
)
TEMP
Uploading the table structure for your reference.

-- For every order, look at the calendar rows between its order date and its
-- contact date: count them and add up the weekend/holiday flags.
-- The alias "act" is used because AT is a reserved word in Teradata.
SELECT
    act.ORDER#,
    SUM(bd.Holiday_WKND_Flag) AS weekend_holiday_days,
    COUNT(*) AS calendar_days
FROM business_dates AS bd
INNER JOIN actual_table AS act
    ON bd.table_date BETWEEN act.order_date AND act.contact_date
GROUP BY act.ORDER#

Instead of joining on a BETWEEN (which always results in a bad Product Join) followed by a COUNT, you had better assign a business day number to each date (in the best case this is calculated only once and added as a column to your calendar table). Then it's two Equi-Joins and no aggregation is needed:
WITH cte AS
 (
   SELECT
      Cast(table_date AS DATE) AS table_date,
      -- Assign a consecutive number to each business day, i.e. the running
      -- total is not increased on rows flagged as weekend/holiday.
      Sum(CASE WHEN Holiday_WKND_Flag = 1 THEN 0 ELSE 1 END)
      Over (ORDER BY table_date
            ROWS Unbounded Preceding) AS business_day_nbr
   FROM business_dates
 )
-- Two equi-joins against the numbered calendar: one for the order date, one
-- for the contact date. The difference of the two business day numbers is the
-- number of business days between them; no aggregation is needed.
SELECT t.ORDER#,
   Cast(t.contact_date AS DATE) - Cast(t.order_date AS DATE) AS #_of_days,
   b2.business_day_nbr - b1.business_day_nbr AS #_of_business_days
FROM actual_table AS t
JOIN cte AS b1
  ON Cast(t.order_date AS DATE) = b1.table_date
JOIN cte AS b2
  ON Cast(t.contact_date AS DATE) = b2.table_date
Btw, why are table_date and order_date timestamp instead of a date?
Porting from Oracle?

You can use this query. Hope it helps
-- For every order, count the calendar rows between order_date and
-- contact_date that are not flagged as weekend/holiday.
-- The calendar table is named business_dates, as in the question.
select a.order#,
       a.order_date,
       a.contact_date,
       (select count(*)
          from business_dates bd
         where bd.table_date between a.order_date and a.contact_date
           and bd.holiday_wknd_flag = 0
       ) as business_days
  from actual_table a

Combine two queries with monthly average

I need to put together the results of these two queries into a single return with the following structure:
"date", avg(selic."Taxa"), avg(titulos."puVenda")
Partial structure of tables:
selic
"dtFechamento" date,
"pTaxa" real
titulos
"dtTitulo" date,
"puVenda" real,
"nomeTitulo" character(30)
Query table selic:
select to_char("dtFechamento", 'YYYY-MM') as data, avg("pTaxa")
from "selic"
group by data
order by data
Query table titulos:
select to_char("dtTitulo", 'YYYY-MM') as data, avg("puVenda")
from "titulos"
where "nomeTitulo" = 'LFT010321'
group by data
order by data
I tried a subquery, but it returned the fields of both queries next to each other, and I could not combine them into a single result.
select *
from (select to_char("dtFechamento", 'YYYY-MM') as data, avg("pTaxa")
from "selic"
group by data
order by data) as selic,
(select to_char("dtTitulo", 'YYYY-MM') as data, avg("puVenda")
from "titulos"
where "nomeTitulo" = 'LFT010321'
group by data
order by data) as LFT010321;

Assuming you want to return one row per month where either of your two queries returns a row. And pad missing values from the other query with NULL.
Use a FULL [OUTER] JOIN:
-- Monthly average of "pTaxa" from selic, keyed by the first instant of the month.
WITH selic_monthly AS (
   SELECT date_trunc('month', "dtFechamento") AS mon
        , avg("pTaxa")                        AS avg_taxa
   FROM   selic
   GROUP  BY date_trunc('month', "dtFechamento")
)
-- Monthly average of "puVenda" for the title 'LFT010321', keyed the same way.
, titulos_monthly AS (
   SELECT date_trunc('month', "dtTitulo") AS mon
        , avg("puVenda")                  AS avg_venda
   FROM   titulos
   WHERE  "nomeTitulo" = 'LFT010321'
   GROUP  BY date_trunc('month', "dtTitulo")
)
-- One row per month present in either aggregate; the side without data is NULL.
-- The month is formatted as text only here, after joining and sorting.
SELECT to_char(mon, 'YYYY-MM') AS data
     , s.avg_taxa
     , t.avg_venda
FROM   selic_monthly        AS s
FULL   JOIN titulos_monthly AS t USING (mon)
ORDER  BY mon;
It is substantially faster to join after aggregating than before (fewer join operations).
It is also faster to GROUP BY, JOIN and ORDER on timestamp values than on a text rendition. Typically also cleaner and less error prone (although text is unambiguous in this particular case). That's why I use date_trunc() instead of to_char() on lower levels.
If the format for the month is not important, you can just return the timestamp value. Else you can format any way you like after you are done processing.
Similar case with more explanation:
PostgreSQL merge two queries with COUNT and GROUP BY in each

This should get what you need. The inner "PQ" (PreQuery) does a UNION ALL of the two per-month aggregates and adds a flag column to identify which average each row belongs to. Each part is grouped by date, so the outer query will have AT MOST 2 records for a given date: one for Tax, the other for Venda. You therefore don't need a full outer join, nor do you need to build a dynamic calendar to cover all possible dates.
So, it is possible for only a Tax average OR a Venda average OR BOTH.
-- Pivot the flagged per-month averages produced by the inner "PQ" query into
-- one row per month. A CASE without ELSE yields NULL, so a month that has only
-- one of the two averages shows NULL (not a fake 0) for the other one.
SELECT
      PQ.data,
      SUM( CASE when PQ.sumtype = 'T' then PQ.typeavg end ) as AvgTax,
      SUM( CASE when PQ.sumtype = 'V' then PQ.typeavg end ) as AvgVenda
   from
      ( -- 'T' rows: monthly average of "pTaxa" from selic.
        -- Column names are double-quoted because they are mixed-case.
        select
              to_char( "dtFechamento", 'YYYY-MM') as data,
              'T' as sumtype,
              avg( "pTaxa" ) as typeavg
           from
              selic
           group by
              to_char( "dtFechamento", 'YYYY-MM')
        UNION ALL
        -- 'V' rows: monthly average of "puVenda" for the title 'LFT010321'.
        select
              to_char( "dtTitulo", 'YYYY-MM') as data,
              'V' as sumtype,
              avg( "puVenda" ) as typeavg
           from
              titulos
           where
              "nomeTitulo" = 'LFT010321'
           group by
              to_char( "dtTitulo", 'YYYY-MM') ) PQ
   group by
      PQ.data
   order by
      PQ.data

SQL Group By for quarterly dates

transaction_date is in a date format.
What I'm actually trying to output is the COUNT DISTINCT of Unique_ID by quarter (i.e., how many times did a Unique_Id appear in a given quarter).
SELECT transaction_date ,
UNIQUE_ID,
FROM panel
WHERE (some criteria = 'x')
GROUP BY UNIQUE_ID

try this :
-- Number of distinct UNIQUE_ID values per quarter number (1-4).
-- The WHERE clause is the placeholder filter from the question.
SELECT datepart(quarter, transaction_date) as transaction_quarter,
       count(distinct UNIQUE_ID) as cnt
FROM panel
WHERE (some criteria = 'x')
GROUP BY datepart(quarter, transaction_date)
However, count(distinct) has to sort the data, which can take a lot of time. Alternatively, you can apply DISTINCT first in a derived table and then do a plain count:
-- Same result as count(distinct UNIQUE_ID) per quarter, but the duplicates are
-- removed first in the derived table.
-- DISTINCT is applied to (quarter, UNIQUE_ID): deduplicating on the full
-- transaction_date would still count an id once per date within a quarter.
-- The placeholder filter from the question is applied inside the derived
-- table, where the columns of panel are still available.
SELECT p.transaction_quarter,
       count(p.UNIQUE_ID) as cnt
FROM (select distinct
             datepart(quarter, transaction_date) as transaction_quarter,
             UNIQUE_ID
      from panel
      WHERE (some criteria = 'x')) as p
GROUP BY p.transaction_quarter

I'd use date_trunc:
-- Distinct unique_id count per quarter; date_trunc keeps the year, so the same
-- quarter of different years ends up in different groups.
select
  date_trunc('quarter', transaction_date),
  count(distinct unique_id)
from
  panel
where
  criteria = 'x'
group by
  date_trunc('quarter', transaction_date)
This presupposes that when you say "by quarter" that 1Q2015 is different than 1Q2014.

-- Number of distinct UNIQUE_ID values per quarter number (1-4).
-- Grouping uses the same quarter expression as the select list, so the result
-- has one row per quarter rather than one row per transaction date.
SELECT  DATEPART(QUARTER, transaction_date) AS transaction_quarter,
        COUNT(DISTINCT UNIQUE_ID) AS unique_id_count
FROM    panel
GROUP BY DATEPART(QUARTER, transaction_date)

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Count Distinct is less than Sum(Count Distinct) - sql

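Because the same user can have multiple events and therefore appear in more than one group. COUNT(DISTINCT ...) only removes duplicates within each group, so such a user is counted once per group, and the sum of the per-group counts ends up larger than the overall distinct count. It is hard to be more descriptive without sample data.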

Related

BigQuery: iterating groups within a window of 28days before a start_date column using _TABLE_SUFFIX

Getting two rows in output in bigquery if using case

Calculating business days in Teradata

Combine two queries with monthly average

SQL Group By for quarterly dates

Categories

Resources