Left Join Lateral is Very Slow - sql

I have the following query
WITH time_series AS (
SELECT *
FROM generate_series(now() - interval '1days', now(), INTERVAL '1 hour') AS ts
), recent_instances AS (
SELECT instance_id,
(CASE WHEN last_update_granted_ts IS NOT NULL THEN last_update_granted_ts ELSE created_ts END),
version,
4 status
FROM instance_application
WHERE group_id=$1
AND last_check_for_updates >= now() - interval '1days'
ORDER BY last_update_granted_ts DESC
), instance_versions AS (
SELECT instance_id, created_ts, version, status
FROM instance_status_history
WHERE instance_id IN (SELECT instance_id
FROM recent_instances)
AND status = 4
UNION
(SELECT * FROM recent_instances)
ORDER BY created_ts DESC
)
SELECT ts,
(CASE WHEN version IS NULL THEN '' ELSE version END),
sum(CASE WHEN version IS NOT null THEN 1 ELSE 0 END) total
FROM (
SELECT *
FROM time_series
LEFT JOIN LATERAL (
SELECT distinct ON (instance_id) instance_Id, version, created_ts
FROM instance_versions
WHERE created_ts <= time_series.ts
ORDER BY instance_Id, created_ts DESC
) _ ON true
) AS _
GROUP BY 1,2
ORDER BY ts DESC;
So instance_versions subquery is executed with every value of timestamps generated from time_series query(see the last select statement). But for some reason the lateral join is very slow,the rows returned by the subquery of lateral join ranges in around 12k-15k(for a single timestamp from time_series query) which is not a big number and the final no of rows returned after the Lateral join ranges from 250k-350k. Is there a way i can optimize this?

Related

Hive query takes forever on Superset

I have a query that was written in Presto SQL format (100 lines of insert a query result to a table that already exists) and takes within 10 minutes to get the result.
Now I am going to use Airflow and need to change the query to Hive SQL format to append previous month's data, there is no error, but it is taking 75+ minutes now and the query is still running and not returning any result.
Shall I 'stop' it or is there anything else to consider?
SET hive.limit.query.max.table.partition = 1000000;
INSERT INTO TABLE schema.temp_tbl partition(year_month_key)
Select
distinct
tbl.account_id,
tbl.theme_status,
streaming.streaming_hours,
tbl.year_month as year_month_key
From
(
Select
tbl_0.year_month,
tbl_0.account_id,
case when max(tbl_0.theme_status) = 1 then 'With Theme' else 'No Theme' end as theme_status
From
(Select
streaming.year_month,
streaming.account_id,
case when theme_events.account_id is not null then 1 else 0 end as theme_status
from
(
Select
substring(date_key, 1, 7) as year_month,
last_day(add_months(date_key, -1)) as year_month_ed,
date_key,
upper(account_id) as account_id,
play_seconds
from agg_device_streaming_metrics_daily
Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
and play_seconds > 0
) streaming
left join
(
Select
upper(theme.virtualuserid) as account_id,
min(theme.createddate) as min_createddate,
min(theme.date_key) as date_key
From
(
select * from theme_activate_event_history
where date_key between '2019-01-01' and '2020-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between '2020-01-01' and '2021-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between '2021-01-01' and '2022-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between cast('2022-01-01' as date) and last_day(add_months(current_date, -1))
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
) theme
group by theme.virtualuserid
) theme_events
on streaming.account_id = theme_events.account_id
and date(theme_events.date_key) <= date(streaming.year_month_ed)
) tbl_0
group by tbl_0.year_month, tbl_0.account_id
) tbl
inner join
(Select
substring(date_key, 1, 7) as year_month,
upper(account_id) as account_id,
cast(sum(play_seconds) / 3600 as double) as streaming_hours
from agg_device_streaming_metrics_daily
Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
and play_seconds > 0
group by substring(date_key, 1, 7), upper(account_id)
) streaming
on tbl.account_id = streaming.account_id and tbl.year_month = streaming.year_month;

Invalid request 'group by' (oracle)

Please tell me how to fix the error. An error Expression not in GROUP BY key 'isin'.
I understand that I am doing the grouping incorrectly, but I do not know how to redo the code for this request correctly. Here you need to find the maximum value of end_circ and the minimum value of begin_circ for the key stocks_full_id. It is necessary to display all columns from select together with max and min.
SELECT a.isin as id,
a.state_number as number,
a.update_time as valid_from_date,
'2999-12-31 00:00:00' as valid_to_date,
a.operdate as oper,
a.inn as inn_num,
a.name_eng as name,
coalesce(ts.full_name_eng,a.name_eng) as full_nm,
max (stg.end_circ) as end_date,
min (stg.begin_circ) as start_date,
case when sk.name_eng IS NULL then sk.name_uk else sk.name_eng end as subtype_nm
FROM (SELECT s.*, rank() over (PARTITION BY isin,state_number ORDER BY operdate desc) as rn
FROM stocks s
WHERE isin IS NOT NULL and state_number IS NOT NULL) a
JOIN trading_stocks ts ON ts.emission_is=a.id
JOIN stocks_trading_grounds stg ON stg.stocks_full_id=a.id
JOIN stocks_kinds sk ON sk.id=a.kind_id
WHERE stg.end_circ >= "2021-01-01 00:00:00" and a.rn=1
GROUP BY stg.stocks_full_id
GROUP BY for the individual table within the JOIN:
SELECT a.isin as id,
a.state_number as number,
a.update_time as valid_from_date,
'2999-12-31 00:00:00' as valid_to_date,
a.operdate as oper,
a.inn as inn_num,
a.name_eng as name,
coalesce(ts.full_name_eng,a.name_eng) as full_nm,
stg.end_date,
stg.start_date,
case when sk.name_eng IS NULL then sk.name_uk else sk.name_eng end as subtype_nm
FROM (SELECT s.*,
rank() over (PARTITION BY isin,state_number ORDER BY operdate desc) as rn
FROM stocks s
WHERE isin IS NOT NULL
and state_number IS NOT NULL
) a
JOIN trading_stocks ts ON ts.emission_is=a.id
JOIN stocks_kinds sk ON sk.id=a.kind_id
JOIN (
SELECT stocks_full_id,
max(stg.end_circ) as end_date,
min(stg.begin_circ) as start_date
FROM stocks_trading_grounds
GROUP BY stocks_full_id
) stg
ON stg.stocks_full_id=a.id
WHERE stg.end_date >= DATE '2021-01-01'
AND a.rn=1
Also, double quotes are not used for a string literal; they are used for a quoted identifier. Either use single quotes '2021-01-01 00:00:00' for a string literal or DATE '2021-01-01' for a date literal.
You have to include all the non-aggregated columns in your group by clause. So you updated query would be -
SELECT a.isin as id,
a.state_number as number,
a.update_time as valid_from_date,
'2999-12-31 00:00:00' as valid_to_date,
a.operdate as oper,
a.inn as inn_num,
a.name_eng as name,
coalesce(ts.full_name_eng,a.name_eng) as full_nm,
max (stg.end_circ) as end_date,
min (stg.begin_circ) as start_date,
case when sk.name_eng IS NULL then sk.name_uk else sk.name_eng end as subtype_nm
FROM (SELECT s.*, rank() over (PARTITION BY isin,state_number ORDER BY operdate desc) as rn
FROM stocks s
WHERE isin IS NOT NULL and state_number IS NOT NULL) a
JOIN trading_stocks ts ON ts.emission_is=a.id
JOIN stocks_trading_grounds stg ON stg.stocks_full_id=a.id
JOIN stocks_kinds sk ON sk.id=a.kind_id
WHERE stg.end_circ >= '2021-01-01 00:00:00' and a.rn=1
GROUP BY a.isin,
a.state_number,
a.update_time,
a.operdate,
a.inn,
a.name_eng,
coalesce(ts.full_name_eng,a.name_eng),
case when sk.name_eng IS NULL then sk.name_uk else sk.name_eng end;

SQL - find row with closest date but different column value

i'm new to SQL and i would need an help.
I have a TAB and I need to find for any item B in the TAB the item A with the closest date. In this case the A with 02.09.2021 04:25:30
Date.
Item
07.09.2021 05:02:05
A
06.09.2021 05:01:02
A
05.09.2021 05:00:02
A
04.09.2021 04:59:01
A
03.09.2021 04:58:03
A
02.09.2021 04:56:55
A
02.09.2021 04:33:56
B
02.09.2021 04:25:30
A
WITH CTE(DATE,ITEM)AS
(
SELECT '20210907 05:02:05' , 'A'UNION ALL
SELECT '20210906 05:01:02' , 'A'UNION ALL
SELECT '20210905 05:00:02' , 'A'UNION ALL
SELECT'20210904 04:59:01' , 'A'UNION ALL
SELECT'20210903 04:58:03' , 'A'UNION ALL
SELECT'20210902 04:56:55' , 'A'UNION ALL
SELECT'20210902 04:33:56' , 'B'UNION ALL
SELECT'20210902 04:25:30' , 'A'
)
SELECT
CAST(C.DATE AS DATETIME)X_DATE,C.ITEM,Q.CLOSEST
FROM CTE AS C
OUTER APPLY
(
SELECT TOP 1 CAST(X.DATE AS DATETIME)CLOSEST
FROM CTE AS X
WHERE X.ITEM='A'AND CAST(X.DATE AS DATETIME)<CAST(C.DATE AS DATETIME)
ORDER BY CAST(X.DATE AS DATETIME) ASC
)Q
WHERE C.ITEM='B'
You can use OUTER APPLY-approach as in the above query.
Please also take a look that datetime-column (DATE)is written in the ISO-compliant form
Your data has only two columns. If you want the only the closest A timestamp, then the fastest way is probably window functions:
select t.*,
(case when prev_a_date is null then next_a_date
when next_a_date is null then prev_a_date
when datediff(second, prev_a_date, date) <= datediff(second, date, next_a_date) then prev_a_date
else next_a_date
end) as a_date
from (select t.*,
max(case when item = 'A' then date end) over (order by date) as prev_a_date,
min(case when item = 'A' then date end) over (order by date desc) as next_a_date
from t
) t
where item = 'B';
This uses seconds to measure the time difference, but you can use a smaller unit if appropriate.
You can also do this using apply if you have more columns from the "A" rows that you want:
select tb.*, ta.*
from t b outer apply
(select top (1) ta.*
from t ta
where item = 'A'
order by abs(datediff(second, a.date, b.date))
) t
where item = 'B';

Identify date range and merge into max and min dates

I have data ( int, date , date types)
SELECT * FROM
(
VALUES
(1700171048,'2020-12-21','2021-01-03'),
(1700171048,'2021-01-05','2021-01-12'),
(1700171048,'2021-01-13','2021-01-17'),
(1700171048,'2021-01-18','2021-01-19'),
(1700171048,'2021-01-22','2021-01-27'),
(1700171048,'2021-01-28','2021-02-17')
(1700171049,'2020-12-21','2021-01-03'),
(1700171049,'2021-01-04','2021-01-05'),
(1700171049,'2021-01-06','2021-01-17'),
(1700171049,'2021-01-18','2021-01-19'),
(1700171049,'2021-01-20','2021-01-27'),
(1700171049,'2021-01-28','2021-02-17')
) AS c (id1, st, endt )
I need output( i.e. if start and end dates are continuous then make it part of group )
id1 st endt
1700171048 '2020-12-21' , '2021-01-03'
1700171048 '2021-01-05' , '2021-01-19'
1700171048 '2021-01-22' , '2021-02-17'
1700171049 '2020-12-21' to '2021-02-17'
I tried this, won't work.
select id, case when min(b.st) = max(b.endt) + 1 then min(b.st) end,
case when min(b.endt) = min(b.st) + 1 then max(b.st) end
from c a join c b
group by id
This is a type of gaps-and-islands problem. Use lag() to identify if there is an overlap. Then a cumulative sum of when there is no overlaps and aggregation:
select id1, min(st), max(endt)
from (select t.*,
sum(case when prev_endt >= st + interval '-1 day' then 0 else 1 end) over (partition by id1 order by st) as grp
from (select t.*,
lag(endt) over (partition by id1 order by st) as prev_endt
from t
) t
) t
group by id1, grp;
Here is a db<>fiddle.

Oracle SQL Hierarchy Summation

I have a table TRANS that contains the following records:
TRANS_ID TRANS_DT QTY
1 01-Aug-2020 5
1 01-Aug-2020 1
1 03-Aug-2020 2
2 02-Aug-2020 1
The expected output:
TRANS_ID TRANS_DT BEGBAL TOTAL END_BAL
1 01-Aug-2020 0 6 6
1 02-Aug-2020 6 0 6
1 03-Aug-2020 6 2 8
2 01-Aug-2020 0 0 0
2 02-Aug-2020 0 1 1
2 03-Aug-2020 1 0 1
Each trans_id starts with a beginning balance of 0 (01-Aug-2020). For succeeding days, the beginning balance is the ending balance of the previous day and so on.
I can create PL/SQL block to create the output. Is it possible to get the output in 1 SQL statement?
Thanks.
Try this following script using CTE-
Demo Here
WITH CTE
AS
(
SELECT DISTINCT A.TRANS_ID,B.TRANS_DT
FROM your_table A
CROSS JOIN (SELECT DISTINCT TRANS_DT FROM your_table) B
),
CTE2
AS
(
SELECT C.TRANS_ID,C.TRANS_DT,SUM(D.QTY) QTY
FROM CTE C
LEFT JOIN your_table D
ON C.TRANS_ID = D.TRANS_ID
AND C.TRANS_DT = D.TRANS_DT
GROUP BY C.TRANS_ID,C.TRANS_DT
ORDER BY C.TRANS_ID,C.TRANS_DT
)
SELECT F.TRANS_ID,F.TRANS_DT,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT < F.TRANS_DT
) BEGBAL,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT = F.TRANS_DT
) TOTAL ,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT <= F.TRANS_DT
) END_BAL
FROM CTE2 F
You can as well do like this (I would assume it's a bit faster): Demo
with
dt_between as (
select mindt + level - 1 as trans_dt
from (select min(trans_dt) as mindt, max(trans_dt) as maxdt from t)
connect by level <= maxdt - mindt + 1
),
dt_for_trans_id as (
select *
from dt_between, (select distinct trans_id from t)
),
qty_change as (
select distinct trans_id, trans_dt,
sum(qty) over (partition by trans_id, trans_dt) as total,
sum(qty) over (partition by trans_id order by trans_dt) as end_bal
from t
right outer join dt_for_trans_id using (trans_id, trans_dt)
)
select
trans_id,
to_char(trans_dt, 'DD-Mon-YYYY') as trans_dt,
nvl(lag(end_bal) over (partition by trans_id order by trans_dt), 0) as beg_bal,
nvl(total, 0) as total,
nvl(end_bal, 0) as end_bal
from qty_change q
order by trans_id, trans_dt
dt_between returns all the days between min(trans_dt) and max(trans_dt) in your data.
dt_for_trans_id returns all these days for each trans_id in your data.
qty_change finds difference for each day (which is TOTAL in your example) and cumulative sum over all the days (which is END_BAL in your example).
The main select takes END_BAL from previous day and calls it BEG_BAL, it also does some formatting of final output.
First of all, you need to generate dates, then you need to aggregate your values by TRANS_DT, and then left join your aggregated data to dates. The easiest way to get required sums is to use analitic window functions:
with dates(dt) as ( -- generating dates between min(TRANS_DT) and max(TRANS_DT) from TRANS
select min(trans_dt) from trans
union all
select dt+1 from dates
where dt+1<=(select max(trans_dt) from trans)
)
,trans_agg as ( -- aggregating QTY in TRANS
select TRANS_ID,TRANS_DT,sum(QTY) as QTY
from trans
group by TRANS_ID,TRANS_DT
)
select -- using left join partition by to get data on daily basis for each trans_id:
dt,
trans_id,
nvl(sum(qty) over(partition by trans_id order by dates.dt range between unbounded preceding and 1 preceding),0) as BEGBAL,
nvl(qty,0) as TOTAL,
nvl(sum(qty) over(partition by trans_id order by dates.dt),0) as END_BAL
from dates
left join trans_agg tr
partition by (trans_id)
on tr.trans_dt=dates.dt;
Full example with sample data:
alter session set nls_date_format='dd-mon-yyyy';
with trans(TRANS_ID,TRANS_DT,QTY) as (
select 1,to_date('01-Aug-2020'), 5 from dual union all
select 1,to_date('01-Aug-2020'), 1 from dual union all
select 1,to_date('03-Aug-2020'), 2 from dual union all
select 2,to_date('02-Aug-2020'), 1 from dual
)
,dates(dt) as ( -- generating dates between min(TRANS_DT) and max(TRANS_DT) from TRANS
select min(trans_dt) from trans
union all
select dt+1 from dates
where dt+1<=(select max(trans_dt) from trans)
)
,trans_agg as ( -- aggregating QTY in TRANS
select TRANS_ID,TRANS_DT,sum(QTY) as QTY
from trans
group by TRANS_ID,TRANS_DT
)
select
dt,
trans_id,
nvl(sum(qty) over(partition by trans_id order by dates.dt range between unbounded preceding and 1 preceding),0) as BEGBAL,
nvl(qty,0) as TOTAL,
nvl(sum(qty) over(partition by trans_id order by dates.dt),0) as END_BAL
from dates
left join trans_agg tr
partition by (trans_id)
on tr.trans_dt=dates.dt;
You can use a recursive query to generate the overall date range, cross join it with the list of distinct tran_id, then bring the table with a left join. The last step is aggregation and window functions:
with all_dates (trans_dt, max_dt) as (
select min(trans_dt), max(trans_dt) from trans group by trans_id
union all
select trans_dt + interval '1' day, max_dt from all_dates where trans_dt < max_dt
)
select
i.trans_id,
d.trans_dt,
coalesce(sum(sum(t.qty)) over(partition by i.trans_id order by d.trans_dt), 0) - coalesce(sum(t.qty), 0) begbal,
coalesce(sum(t.qty), 0) total,
coalesce(sum(sum(t.qty)) over(partition by i.trans_id order by d.trans_dt), 0) endbal
from all_dates d
cross join (select distinct trans_id from trans) i
left join trans t on t.trans_id = i.trans_id and t.trans_dt = d.trans_dt
group by i.trans_id, d.trans_dt
order by i.trans_id, d.trans_dt