rewriting group by in subselect - google-bigquery

I have two queries that return different results when I move the where clause into the inner select query.
-- Counts the distinct (caller, destination) pairs in the date window for
-- destination '1234'. The destination filter is applied in the OUTER query,
-- i.e. after the inner GROUP BY has collapsed rows to one per pair.
SELECT
    destination_number,
    COUNT(*) AS cnt
FROM (
    SELECT
        caller_id_number,
        destination_number
    FROM Final1
    WHERE start_stamp > '2013-01-01 00:00:00'
      AND start_stamp < '2013-01-26 00:00:00'
    GROUP BY
        caller_id_number,
        destination_number
)
WHERE destination_number = '1234'
GROUP BY destination_number
count result: 294636
-- Counts the distinct (caller, destination) pairs in the date window for
-- destination '1234'. The destination filter is applied in the INNER query,
-- i.e. before the rows are grouped into pairs.
SELECT
    destination_number,
    COUNT(*) AS cnt
FROM (
    SELECT
        caller_id_number,
        destination_number
    FROM Final1
    WHERE destination_number = '1234'
      AND start_stamp > '2013-01-01 00:00:00'
      AND start_stamp < '2013-01-26 00:00:00'
    GROUP BY
        caller_id_number,
        destination_number
)
GROUP BY destination_number
count result: 310627
I would like to know why the count changes and which query is correct.

Related

Inserting parameters to SQL query in Oracle SQL

In the following query, the same BETWEEN date-time bounds are repeated in multiple places, and I need to replace them with two variables named start_date and end_date. I tried multiple methods and had no luck. Please answer with a runnable query if you can. Thanks in advance.
-- Admission report: one row per admission (after DISTINCT), enriched with
-- encounter, diagnosis, discharge, death, chief-complaint and medical-leave data.
-- The same window (2022-08-01 00:00:00 .. 2022-08-30 00:00:00, inclusive on both
-- ends because of BETWEEN) is hard-coded in five CTEs: encounter, admission,
-- ip_create_admission, vitals and leaves.

-- encounter: patient POMR rows created inside the window.
WITH encounter
AS (SELECT patient_pomr_id AS encounter_number,
patient_id AS umrn,
doctor_id,
doctor_name
FROM eh_pomr.ehpom_patient_pomr
WHERE created_on BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
-- chief_complain: all chief complaints, unfiltered by date.
chief_complain
AS (SELECT chief_complain,
patient_pomr_id
FROM eh_pomr.ehpom_chief_complain),
-- admission: driving row set. Keeps rows with no direct_admission value and
-- is_from_er != 1, created inside the window.
-- NOTE(review): "is_from_er != 1" also drops rows where is_from_er IS NULL - confirm intended.
admission
AS (SELECT admitted_date,
patient_id,
ADMISSION_ID,
admission_type AS encounter_type,
patient_pomr_id,
hospital_id,
clinic_name
FROM ad_request.admlm_admission
WHERE direct_admission IS NULL
AND is_from_er != 1
AND created_date BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
-- ip_create_admission: admitting diagnosis, taking 2000 units of the LOB from offset 1.
ip_create_admission
AS (SELECT patientpomr,
dbms_lob.Substr(admitting_diagnosis, 2000, 1) diagnosis
FROM eh_ip.ehip_create_admission
WHERE created_on BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
-- discharge: active discharge rows, with the numeric dischargevia code mapped to a label.
discharge
AS (SELECT CASE
WHEN dischargevia = 1 THEN 'Private Vehicle'
WHEN dischargevia = 2 THEN 'Ambulatory'
WHEN dischargevia = 3 THEN 'Other'
ELSE ' Unknown'
END AS dischargevia,
pomrid,
modifiedon AS discharge_date,
conditionondischarge AS discharge_speciality
FROM eh_ndischarge.ehipd_dischargedetails
WHERE isactive = 1),
-- death: underlying cause of death, taking 2000 units of the LOB from offset 1.
death
AS (SELECT dbms_lob.Substr(underlying_cause, 2000, 1) cause_of_death,
patientpomr
FROM eh_ip.ehip_death_detail),
-- empi: maps MRN to the patient's ID number.
empi
AS (SELECT id_number,
mrn
FROM rf_empi.emred_patients),
-- vitals: flow-sheet values for 'vitals' records inside the window.
-- NOTE(review): the WHERE predicate on FS.time_stamp rejects the NULL rows produced
-- by the LEFT JOIN, so this behaves like an inner join. This CTE is also not
-- referenced by the final SELECT (its columns are commented out there).
vitals
AS (SELECT PR.id,
PR.patient_pomr_id,
FS.field_code,
FS.value
FROM eh_commmon.ehcom_patient_record PR
left join eh_commmon.ehcom_flow_sheet_data FS
ON PR.id = FS.patient_record_id
WHERE PR.flow_sheet_code = 'vitals'
AND FS.time_stamp BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'),
-- leaves: medical leaves with leave_status = 5 requested inside the window.
-- NOTE(review): the WHERE predicates on ML columns make the LEFT JOIN to ML
-- behave like an inner join.
leaves
AS (SELECT requesting_days,
visit_id,
ADM.PATIENT_POMR_ID
FROM ad_request.admlm_med_leave_final_print FP
left join ad_request.admlm_medical_leave ML
ON FP.request_id = ML.request_id
LEFT JOIN AD_REQUEST.ADMLM_ADMISSION ADM
ON ML.VISIT_ID = ADM.ADMISSION_ID
WHERE FP.leave_status = 5
AND ML.created_date BETWEEN timestamp '2022-08-01 00:00:00' AND
timestamp '2022-08-30 00:00:00'
AND ML.REQUESTING_DAYS IS NOT NULL)
-- Final projection: every admission is kept (all joins are LEFT JOINs keyed on
-- admission.patient_pomr_id or patient_id); DISTINCT removes duplicate output rows.
SELECT DISTINCT encounter.encounter_number,
admission.encounter_type,
empi.id_number AS Patient_National_ID,
admission.patient_id AS umrn,
admission.admitted_date,
admission.hospital_id,
admission.clinic_name AS admission_speciality,
chief_complain.chief_complain,
leaves.requesting_days AS Duration_of_leave,
encounter.doctor_id,
encounter.doctor_name,
ip_create_admission.diagnosis,
discharge.dischargevia,
discharge.discharge_date,
discharge_speciality,
admission.clinic_name AS clinic,
death.cause_of_death
-- VITALS.field_code,
-- VITALS.value
FROM admission
left join empi
ON admission.patient_id = empi.mrn
left join encounter
ON admission.patient_pomr_id = encounter.encounter_number
left join ip_create_admission
ON admission.patient_pomr_id = ip_create_admission.patientpomr
--admission_request_numbrer with adt
left join discharge
ON admission.patient_pomr_id = discharge.pomrid
left join death
ON admission.patient_pomr_id = death.patientpomr
left join chief_complain
ON admission.patient_pomr_id = chief_complain.patient_pomr_id
left join leaves
ON admission.patient_pomr_id = leaves.PATIENT_POMR_ID
I tried wrapping the query in BEGIN ... END with the DECLARE keyword but had no luck. Also, is there a special way to assign a variable using the INTO keyword when it is needed for a BETWEEN condition?
Include yet another CTE (I'm calling it dates) and cross-join it into the other CTEs that use these values. Something like this:
WITH
    -- New single-row CTE holding the two bounds, so each literal appears only once.
    dates AS (
        SELECT
            timestamp '2022-08-01 00:00:00' AS start_date,
            timestamp '2022-08-30 00:00:00' AS end_date
        FROM dual
    ),
    encounter AS (
        SELECT
            patient_pomr_id AS encounter_number,
            patient_id      AS umrn,
            doctor_id,
            doctor_name
        FROM eh_pomr.ehpom_patient_pomr
        CROSS JOIN dates d                                      -- bring the bounds into scope
        WHERE created_on BETWEEN d.start_date AND d.end_date    -- and use them in place of the literals
    ),
    chief_complain
        AS ..
This is from MSSQL; you can try converting it to Oracle SQL.
-- T-SQL parameter declarations. Variables and parameters take the @ sigil;
-- a leading # would denote a temporary table and does not parse here.
@dateFrom datetime = null,
@dateTo datetime = null,
-- Each expression truncates its parameter to midnight of the same day
-- (days since day 0, added back to day 0).
-- NOTE(review): presumably the two operands of a BETWEEN predicate; the
-- surrounding statement is not shown.
DATEADD(D, 0, DATEDIFF(D, 0, @dateFrom))
AND DATEADD(D, 0, DATEDIFF(D, 0, @dateTo))

How to optimize selection of pairs from one column of the table?

I'm using PostgreSQL 9.5.19, DBeaver 6.3.4
I have a table where each row holds a user's name, a place they visited, and the time they were there.
I need to select all pairs of places that any user visited (if a user was at place a and place b, I need a row like this: user, place a, place b, time at place a, time at place b).
The ponds table:
-- One row per visit: the time (tm) at which a user (user_name) was at a place (place_name).
CREATE TABLE example.example (
    tm         TIMESTAMP         NOT NULL,
    place_name CHARACTER VARYING NOT NULL,
    user_name  CHARACTER VARYING NOT NULL
);
Some sample data:
-- Sample visits: user_1 and user_3 each visit two places, user_2 only one.
INSERT INTO example.example
    (tm, place_name, user_name)
VALUES
    ('2020-02-25 00:00:19.000', 'place_1', 'user_1'),
    ('2020-03-25 00:00:19.000', 'place_2', 'user_1'),
    ('2020-02-25 00:00:19.000', 'place_1', 'user_2'),
    ('2020-03-25 00:00:19.000', 'place_1', 'user_3'),
    ('2020-02-25 00:00:19.000', 'place_2', 'user_3');
I'm trying this script:
-- For every user, pair each place (r1) with every different place visited
-- later (r2) inside the window, reporting the latest r1 visit and the
-- earliest r2 visit among the matching visit pairs.
SELECT
    a.user_name,
    a.place_name AS r1_place,
    MAX(a.tm)    AS r1_tm,
    b.place_name AS r2_place,
    MIN(b.tm)    AS r2_tm
FROM example.example AS a
INNER JOIN example.example AS b
    ON  a.user_name = b.user_name
    AND a.tm < b.tm
    AND a.place_name <> b.place_name
WHERE a.tm BETWEEN '2020-02-25 00:00:00' AND '2020-03-25 15:00:00'
  AND b.tm BETWEEN '2020-02-25 00:00:00' AND '2020-03-25 15:00:00'
GROUP BY
    a.user_name,
    a.place_name,
    b.place_name
It seems to give me the right result, but it runs really slowly.
Can I optimize it somehow?
I would suggest trying indexes. For this query:
-- Same pair-finding self-join as the question, written against the question's
-- table and date window. The placeholder schema.table does not parse in
-- PostgreSQL because TABLE is a reserved word.
select t.user_name, t.place_name as r1_place, max(t.tm) as r1_tm,
       t2.place_name as r2_place, min(t2.tm) as r2_tm
from example.example t join
     example.example t2
     on t.user_name = t2.user_name and
        t.tm < t2.tm and
        t.place_name <> t2.place_name
where t.tm between '2020-02-25 00:00:00' and '2020-03-25 15:00:00' and
      t2.tm between '2020-02-25 00:00:00' and '2020-03-25 15:00:00'
group by t.user_name, t.place_name, t2.place_name
I would suggest an index on (tm, user_name, place_name) and on (user_name, tm, place_name) -- yes, both, one for each reference.
A colleague helped me write a version using a window function:
-- For each visit, look at the same user's next visit in time order and keep
-- the rows where that next visit exists and is at a different place;
-- seconds_diff is the gap between the two visits in seconds.
WITH visit_with_next AS (
    SELECT
        v.user_name,
        v.place_name,
        v.tm,
        LEAD(v.place_name) OVER (PARTITION BY v.user_name ORDER BY v.tm ASC) AS next_place_name,
        LEAD(v.tm)         OVER (PARTITION BY v.user_name ORDER BY v.tm ASC) AS next_tm
    FROM example.example AS v
)
SELECT
    user_name,
    place_name,
    tm,
    next_place_name,
    next_tm,
    EXTRACT(EPOCH FROM (next_tm - tm)) AS seconds_diff
FROM visit_with_next
WHERE next_place_name IS NOT NULL
  AND next_tm IS NOT NULL
  AND place_name <> next_place_name
;

Invalid column reference in hive

When I run the below query, I get the error "Invalid column reference: cnt". Any suggestions would be great !!
-- Query exactly as asked about: it is reported to fail with
-- "Invalid column reference: cnt" because the outer WHERE refers to the
-- select-list alias cnt.
SELECT COUNT(customer) AS cnt
FROM (
    SELECT
        customer,
        concat(visid, lowid),
        COUNT(name)
    FROM tab1
    WHERE date_time BETWEEN '2017-05-01 00:00:00' AND '2017-05-31 23:59:59'
      AND name IN ('payment: Complete', 'check: Complete')
    GROUP BY
        evar71,
        concat(visid, lowid)
) t1
WHERE cnt > 1;
Another way to do it.
-- Counts customers over the grouped May-2017 completion events and keeps the
-- result only when that count exceeds 1. HAVING is used because it filters
-- after aggregation, whereas WHERE runs before it and cannot see cnt.
select count(customer) as cnt
from (
    select customer,
           concat(visid, lowid) as ids,
           count(name)          as cc
    from tab1
    where date_time between '2017-05-01 00:00:00' and '2017-05-31 23:59:59'
      and name in ('payment: Complete', 'check: Complete')
    -- Group by the selected column. The original grouped by evar71, which is
    -- not in the select list and left customer ungrouped.
    group by customer, concat(visid, lowid)
) t1
having count(customer) > 1;
The WHERE filter is applied before aggregation, which is why where cnt > 1 does not work. The HAVING keyword introduces a condition on aggregates; it works as a filter after aggregation.
-- Skeleton (not runnable: "..." stands for the FROM clause) showing where each
-- clause acts relative to aggregation.
select count(customer) cnt
...
where rows_filter_condition_here -- row filter: applied before aggregation
having count(customer) > 1 -- filter on the aggregated result
order by cnt desc -- applied after aggregation, so the alias cnt is usable here
I think hive prefers aliases in the group by. In addition, several column aliases are not correct:
-- Counts the (customer, ids) groups in May 2017 that have more than one
-- non-null name among the two completion events. The inner aggregates are
-- aliased so the outer WHERE can filter on cc.
-- NOTE(review): confirm the target Hive version resolves the select alias
-- "ids" inside GROUP BY; otherwise repeat concat(visid, lowid) there.
SELECT COUNT(customer) AS cnt
FROM (
    SELECT
        customer,
        concat(visid, lowid) AS ids,
        COUNT(name)          AS cc
    FROM tab1
    WHERE date_time >= '2017-05-01'
      AND date_time <  '2017-06-01'
      AND name IN ('payment: Complete', 'check: Complete')
    GROUP BY
        customer,
        ids
) t1
WHERE cc > 1;

SQL Server : combining COUNT(*) for different tables at once

I have a few queries that I would like to combine into ONE query in order to not have to call out to the server multiple times.
An example of the queries I am using:
-- Four separate round trips: one count per (table, month window) combination.
-- Both bounds are strict, so rows stamped exactly on a boundary are excluded.

-- memberEmails, first window
SELECT COUNT(*) AS mailCount1
FROM [WebContact].[dbo].[memberEmails]
WHERE contactdatetime > '01/01/06'
  AND contactdatetime < '02/01/06';

-- otherEmails, first window
SELECT COUNT(*) AS mailCount2
FROM [WebContact].[dbo].[otherEmails]
WHERE contactdatetime > '01/01/06'
  AND contactdatetime < '02/01/06';

-- memberEmails, second window
SELECT COUNT(*) AS mailCount3
FROM [WebContact].[dbo].[memberEmails]
WHERE contactdatetime > '02/01/06'
  AND contactdatetime < '03/01/06';

-- otherEmails, second window
SELECT COUNT(*) AS mailCount4
FROM [WebContact].[dbo].[otherEmails]
WHERE contactdatetime > '02/01/06'
  AND contactdatetime < '03/01/06';
etc etc...
So as the examples above, only thing that changes are:
The FROM (memberEmails & otherEmails)
The > & < months (01/01/06, 02/01/06 | 02/01/06, 03/01/06 | etc...)
Is this possible to do with a single query?
First, use group by and just use two queries:
-- Message counts for memberEmails, one row per year/month present in the table.
-- The three-part name needs its opening bracket: [WebContact].[dbo].[memberEmails].
select year(contactdatetime) as yyyy, month(contactdatetime) as mm, count(*) as cnt
from [WebContact].[dbo].[memberEmails]
group by year(contactdatetime), month(contactdatetime);
and:
-- Message counts for otherEmails, one row per year/month present in the table.
-- The three-part name needs its opening bracket: [WebContact].[dbo].[otherEmails].
select year(contactdatetime) as yyyy, month(contactdatetime) as mm, count(*) as cnt
from [WebContact].[dbo].[otherEmails]
group by year(contactdatetime), month(contactdatetime);
Then, if you like, you can combine these into a single query:
-- Monthly counts from both tables side by side. The FULL OUTER JOIN keeps
-- months that appear in only one table, and COALESCE turns the missing side's
-- count into 0. Both table names now carry the opening bracket that was missing.
select coalesce(me.yyyy, oe.yyyy) as yyyy, coalesce(me.mm, oe.mm) as mm,
       coalesce(me.cnt, 0) as memberemailcnt,
       coalesce(oe.cnt, 0) as otheremailcnt
from (select year(contactdatetime) as yyyy, month(contactdatetime) as mm, count(*) as cnt
      from [WebContact].[dbo].[memberEmails]
      group by year(contactdatetime), month(contactdatetime)
     ) me full outer join
     (select year(contactdatetime) as yyyy, month(contactdatetime) as mm, count(*) as cnt
      from [WebContact].[dbo].[otherEmails]
      group by year(contactdatetime), month(contactdatetime)
     ) oe
     on me.yyyy = oe.yyyy and me.mm = oe.mm;
A full outer join is not necessary if both tables have data for all months.
-- Collect per-table, per-month row counts into a table variable, then return
-- them as a single result set. Table variables are declared with the @ sigil;
-- "#emailCount" would name a temporary table and is not valid in DECLARE ... TABLE.
declare @emailCount table (tablename varchar(20), year int, month int, qty int);

insert into @emailCount (tablename, year, month, qty)
select 'memberEmails', year(contactdatetime), month(contactdatetime), count(*)
from [WebContact].[dbo].[memberEmails]
group by year(contactdatetime), month(contactdatetime);

insert into @emailCount (tablename, year, month, qty)
select 'otherEmails', year(contactdatetime), month(contactdatetime), count(*)
from [WebContact].[dbo].[otherEmails]
group by year(contactdatetime), month(contactdatetime);

select tablename, year, month, qty from @emailCount;
Add WHERE clause if needed to restrict date ranges. (edit- simplified to use year() and month() functions.)
I haven't checked the syntax or performance, but you can do something like this:
-- Compute the four counts as labelled rows, then pivot the labels into
-- columns so that a single row with mailCount1..mailCount4 is returned.
WITH cte AS (
    SELECT COUNT(*) AS countvalue, 'mailCount1' AS description
    FROM [WebContact].[dbo].[memberEmails]
    WHERE contactdatetime > '01/01/06'
      AND contactdatetime < '02/01/06'

    UNION ALL

    SELECT COUNT(*), 'mailCount2'
    FROM [WebContact].[dbo].[otherEmails]
    WHERE contactdatetime > '01/01/06'
      AND contactdatetime < '02/01/06'

    UNION ALL

    SELECT COUNT(*), 'mailCount3'
    FROM [WebContact].[dbo].[memberEmails]
    WHERE contactdatetime > '02/01/06'
      AND contactdatetime < '03/01/06'

    UNION ALL

    SELECT COUNT(*), 'mailCount4'
    FROM [WebContact].[dbo].[otherEmails]
    WHERE contactdatetime > '02/01/06'
      AND contactdatetime < '03/01/06'
)
SELECT
    mailCount1,
    mailCount2,
    mailCount3,
    mailCount4
FROM cte
PIVOT (
    MAX(countvalue)
    FOR description IN (mailCount1, mailCount2, mailCount3, mailCount4)
) AS piv;
Hope this helps..

TERADATA: query optimization

This query is working but it seems to take longer time than usual to retrieve the data. Is there a better solution to optimize this query? I need to get all PRD_ID from T1 and T2 even if there is no match with S1 and S2.
-- Question's query, kept as posted. It unions PRODUCT rows for region 'CA'
-- (left-joined to SERVICE_1) with PRODUCT rows for region 'IL' (left-joined to
-- SERVICE_2), so products without a matching service row are still returned.
-- NOTE(review): as posted there is no comma between T.PRD_ID and T.AMOUNT, and
-- the derived table T exposes PRD_ID, PRD_CODE, DATE and REGION but not AMOUNT.
SELECT DISTINCT T.PRD_ID T.AMOUNT, T.DATE, T.REGION
FROM
(
-- Branch 1: region 'CA' products with their SERVICE_1 order dates in the range.
SELECT DISTINCT T1.PRD_ID, T1.PRD_CODE, S1.ORDER_DATE AS DATE, T1.REGION
FROM
(
(SELECT PRD_ID, PRD_CODE,AMOUNT,REGION
FROM PRODUCT
WHERE REGION='CA') T1
LEFT JOIN SERVICE_1 S1
ON S1.PRD_ID = T1.PRD_ID
AND S1.PRD_CODE= T1.PRD_CODE
AND S1.AMT = T1.AMOUNT
-- The date range sits in ON (not WHERE) so unmatched products are kept.
AND S1.ORDER_DATE >= '01/01/2015'
AND S1.ORDER_DATE <= '02/28/2015'
)
UNION ALL
-- Branch 2: region 'IL' products with their SERVICE_2 calendar dates in the range.
SELECT DISTINCT T2.PRD_ID, T2.PRD_CODE, S2.ACCT_CALENDAR_DT AS DATE, T2.REGION
FROM
(
(SELECT PRD_ID, PRD_CODE,AMOUNT,REGION
FROM PRODUCT
WHERE REGION='IL') T2
LEFT JOIN SERVICE_2 S2
ON S2.PRD_ID = T2.PRD_ID
AND S2.PRD_CODE= T2.PRD_CODE
AND S2.AMT = T2.AMOUNT
-- NOTE(review): this branch writes its date strings as 'YYYYMMDD' while the
-- first branch uses 'MM/DD/YYYY'.
AND S2.ACCT_CALENDAR_DT >= '20150101'
AND S2.ACCT_CALENDAR_DT <= '20150228'
)
) T
-- NOTE(review): T names its date column DATE; ORDER_DATE is not among T's columns.
ORDER BY REGION, ORDER_DATE DESC, PRD_ID
I can't see why you need all these (3!) levels of nested tables. The following should be equivalent:
-- Flattened form: each branch joins PRODUCT directly to its service table and
-- filters the region in WHERE. The date range stays in the ON clause so that
-- products without a matching service row are still returned.
SELECT DISTINCT
    T1.PRD_ID,
    T1.PRD_CODE,
    S1.ORDER_DATE AS DATE,
    T1.REGION
FROM PRODUCT T1
LEFT JOIN SERVICE_1 S1
    ON  S1.PRD_ID   = T1.PRD_ID
    AND S1.PRD_CODE = T1.PRD_CODE
    AND S1.AMT      = T1.AMOUNT
    -- typed DATE literals replace the '01/01/2015' / '02/28/2015' strings
    AND S1.ORDER_DATE BETWEEN DATE '2015-01-01' AND DATE '2015-02-28'
WHERE T1.REGION = 'CA'

UNION ALL   -- the two branches carry different REGION values, so no cross-branch dedup is needed

SELECT DISTINCT
    T2.PRD_ID,
    T2.PRD_CODE,
    S2.ACCT_CALENDAR_DT AS DATE,
    T2.REGION
FROM PRODUCT T2
LEFT JOIN SERVICE_2 S2
    ON  S2.PRD_ID   = T2.PRD_ID
    AND S2.PRD_CODE = T2.PRD_CODE
    AND S2.AMT      = T2.AMOUNT
    AND S2.ACCT_CALENDAR_DT BETWEEN DATE '2015-01-01' AND DATE '2015-02-28'
WHERE T2.REGION = 'IL'

ORDER BY REGION, DATE DESC, PRD_ID ;
or:
-- Variant that filters PRODUCT by region inside a derived table and emits the
-- region as a literal. The date range stays in the ON clause so that products
-- without a matching service row are still returned.
SELECT DISTINCT
    T1.PRD_ID,
    T1.PRD_CODE,
    S1.ORDER_DATE AS DATE,
    'CA' AS REGION
FROM (
    SELECT PRD_ID, PRD_CODE, AMOUNT
    FROM PRODUCT
    WHERE REGION = 'CA'
) T1
LEFT JOIN SERVICE_1 S1
    ON  S1.PRD_ID   = T1.PRD_ID
    AND S1.PRD_CODE = T1.PRD_CODE
    AND S1.AMT      = T1.AMOUNT
    AND S1.ORDER_DATE BETWEEN DATE '2015-01-01' AND DATE '2015-02-28'

UNION ALL

SELECT DISTINCT
    T2.PRD_ID,
    T2.PRD_CODE,
    S2.ACCT_CALENDAR_DT AS DATE,
    'IL' AS REGION
FROM (
    SELECT PRD_ID, PRD_CODE, AMOUNT
    FROM PRODUCT
    WHERE REGION = 'IL'
) T2
LEFT JOIN SERVICE_2 S2
    ON  S2.PRD_ID   = T2.PRD_ID
    AND S2.PRD_CODE = T2.PRD_CODE
    AND S2.AMT      = T2.AMOUNT
    AND S2.ACCT_CALENDAR_DT BETWEEN DATE '2015-01-01' AND DATE '2015-02-28'

ORDER BY REGION, DATE DESC, PRD_ID ;