How to do a left outer join on inequality? - sql

I have the following SQL query on BigQuery. I am trying to join two different tables one of which is a much smaller table than the other. At first I used the regular join but this results in elimination of some of the data that I am working with.
with weekly_periods as(
select
ticket_id,
start_time_in_minutes_from_week,
raw_delta_in_minutes,
week_number,
greatest(0, start_time_in_minutes_from_week - week_number * (7 * 24 * 60)) as ticket_week_start_time,
least(start_time_in_minutes_from_week + raw_delta_in_minutes - week_number * (7 * 24 * 60),(7 * 24 * 60)) as ticket_week_end_time
from
ticket_solved_time,
unnest(generate_array(0, floor((start_time_in_minutes_from_week + raw_delta_in_minutes) / (7 * 24 * 60)), 1)) as week_number,
intercepted_periods as(
select
ticket_id,
week_number,
ticket_week_start_time,
ticket_week_end_time,
schedule.start_time as schedule_start_time,
schedule.end_time as schedule_end_time,
least(ticket_week_end_time, schedule.end_time) - greatest(ticket_week_start_time, schedule.start_time) as scheduled_minutes
from
weekly_periods
left join
schedule
on ticket_week_start_time <= schedule.end_time
and ticket_week_end_time >= schedule.start_time
But I am receiving an error of: -- LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join --
How would it be possible to do this join while preserving the data? If I just do JOIN, the query doesn't return the full result.
Thank you!

What you need to do is use a cross join and then add your join condition in where clause as shown below:
with weekly_periods as (
  select
    ticket_id,
    start_time_in_minutes_from_week,
    raw_delta_in_minutes,
    week_number,
    -- clamp the ticket start to the beginning of the given week
    greatest(0, start_time_in_minutes_from_week - week_number * (7 * 24 * 60)) as ticket_week_start_time,
    -- clamp the ticket end to the end of the given week (7*24*60 minutes)
    least(start_time_in_minutes_from_week + raw_delta_in_minutes - week_number * (7 * 24 * 60), (7 * 24 * 60)) as ticket_week_end_time
  from
    ticket_solved_time,
    -- one row per week the ticket spans
    unnest(generate_array(0, floor((start_time_in_minutes_from_week + raw_delta_in_minutes) / (7 * 24 * 60)), 1)) as week_number
),  -- each CTE must be closed before the next one starts
intercepted_periods as (
  select
    ticket_id,
    week_number,
    ticket_week_start_time,
    ticket_week_end_time,
    schedule.start_time as schedule_start_time,
    schedule.end_time as schedule_end_time,
    -- overlap (in minutes) between the ticket window and the schedule window
    least(ticket_week_end_time, schedule.end_time) - greatest(ticket_week_start_time, schedule.start_time) as scheduled_minutes
  from
    weekly_periods
  cross join
    schedule
  -- BigQuery rejects LEFT JOIN on a pure inequality; a CROSS JOIN with the
  -- inequality moved into WHERE expresses the same overlap condition
  where ticket_week_start_time <= schedule.end_time
    and ticket_week_end_time >= schedule.start_time
)
select * from intercepted_periods

You need to join the tables based on a relational key using an = operator (example below) then use where to implement your criteria... Since you didn't post your table structure, this is just an example of the correct way to join
left join
schedule
on schedule.id = weekly_period.ticketid and
weekly_period.ticketid = intercepted_period.ticketid
where
ticket_week_start_time <= schedule.end_time
and ticket_week_end_time >= schedule.start_time

Related

How to reduce the fetching time in view table?

I have tried to fetch the data from a view table. I have executed a select query in phpMyAdmin. It takes 5.2875 seconds, but the normal table takes only 0.0300 seconds.
Query :
SELECT line FROM `summary_view` WHERE date='2022-02-25'
CREATE VIEW summary_view AS
select
adh.id AS id,
adh.line AS line,
fam.area AS area,
fam.sub_area AS sub_area,
fam.family AS family,
pro.produced AS produced,
pro.service AS service,
round(((testdb.down * 60) / adh.takt_time),0) AS units_lost,
round(((((adh.worked_time * 60) - adh.break_time) * 60) / adh.takt_time),0) AS oa_capacity,
round(((((adh.plan_time * 60) - adh.break_time) * 60) / adh.takt_time),0) AS ay_capacity,
testdb.machines AS machines,
testdb.manpower AS manpower,
testdb.material AS material,
testdb.methods AS methods,
testdb.misc AS misc,
testdb.down AS down,
round((((pro.produced + pro.service) * adh.takt_time) / 60),0) AS uptime,
adh.break_time AS break_time,
round(((adh.worked_time * 60) - (((((pro.produced + pro.service) * adh.takt_time) / 60) + testdb.down) + adh.break_time)),0) AS minute_error,
round((adh.worked_time * 60),0) AS working_time,
round(((adh.worked_time * 60) - adh.break_time),0) AS worked_time,
round(((adh.plan_time * 60) - adh.break_time),0) AS plan_time,
round(adh.takt_time,0) AS takt_time,
NULL AS headcount,
rep.nff AS nff,
rep.dpu AS dpu,
round(((((pro.produced + pro.service) * adh.takt_time) / 60) / ((adh.worked_time * 60) - adh.break_time)),4) AS oa,
round(((((pro.produced + pro.service) * adh.takt_time) / 60) / ((adh.plan_time * 60) - adh.break_time)),4) AS ay,
adh.shift AS shift,
GET_MONTH(adh.date) AS month,
GET_WEEK(adh.date) AS week,
adh.date AS date,
adh.skey AS skey
from
(
(
(
(
adhoc adh
left join
(
select
sum(if((down.category = 'Machines'),down.duration,NULL)) AS machines,
sum(if((down.category = 'Manpower'),down.duration,NULL)) AS manpower,
sum(if((down.category = 'Material'),down.duration,NULL)) AS material,
sum(if((down.category = 'Methods'),down.duration,NULL)) AS methods,
sum(if((down.category = 'Misc'),down.duration,NULL)) AS misc,
sum(down.duration) AS down,
down.skey AS skey
from (down join family on((down.line = family.line)))
where ((down.reason <> 'Scheduled Shutdown') and (family.area in ('Indoor','Outdoor','Specialty')))
group by down.skey
)
testdb on((adh.skey = testdb.skey))
)
left join
(
select
count(if((prod.repair_flag <> 'S'),1,NULL)) AS produced,
count(if((prod.repair_flag = 'S'),1,NULL)) AS service,
prod.skey AS skey
from (prod join family on((prod.line = family.line)))
group by prod.skey
)
pro on((adh.skey = pro.skey))
)
left join
(
select
count(if((repair.level_one = 'No Fault Found'),1,NULL)) AS nff,
count(if(((repair.level_one <> 'No Fault Found') and (repair.level_two <> 'Reclaim Refrigerant')),1,NULL)) AS dpu,
repair.skey AS skey
from repair
group by repair.skey
)
rep on((adh.skey = rep.skey))
)
join family fam on((adh.line = fam.line))
)
where (adh.area = 'Assembly')
;
How to reduce the query execution time in view table ?
Is there any way to reduce the execution time without adding index ?

Synapse serverless TPC-H Query15 wrong syntax

I am trying the TPC-H queries; they all worked fine except number 15 — basically, supplier_no was not recognized. Do you know how to rewrite it? The only change I made for all queries is to replace LIMIT with TOP.
SELECT
--Query15
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
FROM
supplier,
(
SELECT
l_suppkey AS supplier_no,
SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM
lineitem
WHERE
l_shipdate >= CAST('1996-01-01' AS date)
AND l_shipdate < CAST('1996-04-01' AS date)
GROUP BY
supplier_no
) revenue0
WHERE
s_suppkey = supplier_no
AND total_revenue = (
SELECT
MAX(total_revenue)
FROM
(
SELECT
l_suppkey AS supplier_no,
SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM
lineitem
WHERE
l_shipdate >= CAST('1996-01-01' AS date)
AND l_shipdate < CAST('1996-04-01' AS date)
GROUP BY
supplier_no
) revenue1
)
ORDER BY
s_suppkey;
If you are getting the following errors, you just need to make sure that you are referring to the source column name (l_suppkey in this case), not the alias (supplier_no):
Msg 207, Level 16, State 1, Line 1 Invalid column name 'supplier_no'.
Msg 164, Level 15, State 1, Line 1 Each GROUP BY expression must
contain at least one column that is not an outer reference.
A full statement which has been tested against a dedicated SQL pool in Azure Synapse Analytics:
-- TPC-H Query 15 rewritten for Synapse: GROUP BY must name the source
-- column, because T-SQL does not allow SELECT-list aliases in GROUP BY.
SELECT
--Query15
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
FROM
supplier,
(
SELECT
l_suppkey AS supplier_no,
SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM
lineitem
WHERE
l_shipdate >= CAST('1996-01-01' AS date)
AND l_shipdate < CAST('1996-04-01' AS date)
GROUP BY
-- fix: group by the source column, not the alias supplier_no
l_suppkey
) revenue0
WHERE
s_suppkey = supplier_no
AND total_revenue = (
SELECT
MAX(total_revenue)
FROM
(
SELECT
l_suppkey AS supplier_no,
SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM
lineitem
WHERE
l_shipdate >= CAST('1996-01-01' AS date)
AND l_shipdate < CAST('1996-04-01' AS date)
GROUP BY
-- fix: same change in the inner derived table
l_suppkey
) revenue1
)
ORDER BY
s_suppkey;
NB SQL Server has the ability to refer to the alias in the ORDER BY statement but not the GROUP BY.
Re related discussion on performance on Azure Synapse Serverless SQL Pools:
Just for fun, I repartitioned my TPC-H SF10 dbo.lineitem table by l_shipdate, added the filepath() metadata function to filter on and got the warm query down to 1 sec, 7 seconds on first run. So some caching did seem to be in play.
I realise you have not had to do these very query-specific optimisations for the other platforms but I wanted to see if it was possible to improve the performance.
I suppose Q14 is to test specific transformation rules in the respective db engines:
The query:
;WITH cte AS
(
    SELECT
        l_suppkey,
        SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
    FROM OPENROWSET(
        BULK 'enriched/tpch/tpch10/lineitem_partitioned/*/*.parquet',
        DATA_SOURCE = 'MyDataSource',
        FORMAT = 'PARQUET'
    ) x
    -- partition pruning via the filepath() metadata function
    WHERE x.filepath(1) = 1996
    -- half-open range matches the TPC-H predicate;
    -- BETWEEN is closed on both ends and would wrongly include 1996-04-01
    AND l_shipdate >= CAST('1996-01-01' AS DATE)
    AND l_shipdate <  CAST('1996-04-01' AS DATE)
    GROUP BY l_suppkey
)
SELECT
    s.s_suppkey,
    s.s_name,
    s.s_address,
    s.s_phone,
    c.total_revenue
FROM ext.supplier s
INNER JOIN cte c ON s.s_suppkey = c.l_suppkey
WHERE total_revenue = ( SELECT MAX(total_revenue) FROM cte );

Attempting to calculate absolute change and % change in 1 query

I'm having trouble with the SELECT portion of this query. I can calculate the absolute change just fine, but when I want to also find out the percent change I get lost in all the subqueries. Using BigQuery. Thank you!
SELECT
station_name,
ridership_2013,
ridership_2014,
absolute_change_2014 / ridership_2013 * 100 AS percent_change,
(ridership_2014 - ridership_2013) AS absolute_change_2014,
It will probably be beneficial to organize your query with CTEs and descriptive aliases to make things a bit easier. For example...
with
data as (select * from project.dataset.table),
-- total rides per year
ridership_by_year as (
    select
        extract(year from ride_date) as yr,
        count(*) as rides
    from data
    group by 1
),
-- rides per year broken down by station
ridership_by_year_and_station as (
    select
        extract(year from ride_date) as yr,
        station_name,
        count(*) as rides
    from data
    group by 1, 2
),
-- year-over-year change across all stations
yearly_changes as (
    select
        this_year.yr,
        this_year.rides,
        prev_year.rides as prev_year_rides,
        this_year.rides - coalesce(prev_year.rides, 0) as absolute_change_in_rides,
        -- safe_divide returns null instead of erroring when prev_year.rides is null or zero
        safe_divide(this_year.rides - coalesce(prev_year.rides, 0), prev_year.rides) as relative_change_in_rides
    from ridership_by_year this_year
    left join ridership_by_year prev_year on this_year.yr = prev_year.yr + 1
),
-- year-over-year change per station
-- fix: join the per-station CTE (not ridership_by_year) on both year AND station
yearly_station_changes as (
    select
        this_year.yr,
        this_year.station_name,
        this_year.rides,
        prev_year.rides as prev_year_rides,
        this_year.rides - coalesce(prev_year.rides, 0) as absolute_change_in_rides,
        safe_divide(this_year.rides - coalesce(prev_year.rides, 0), prev_year.rides) as relative_change_in_rides
    from ridership_by_year_and_station this_year
    left join ridership_by_year_and_station prev_year
        on this_year.yr = prev_year.yr + 1
        and this_year.station_name = prev_year.station_name
)
select * from yearly_changes
--select * from yearly_station_changes
Yes this is a bit longer, but IMO it is much easier to understand.

H2 SQL Query column not found in scope

In relation with : SQL: Find unavailability per minute of a ressource in an appointment
((click above link for schema + info))
I'm trying to run this query in an H2 SQL database. I'm a little unfamiliar with H2 syntax. I noted the column num in the WHERE clause that causes the issue.
Error:
Column "NUM" not found; SQL statement:
CREATE FORCE VIEW (
SELECT
"A"."APPOINTMENT_ID",
"A"."APPOINTMENT_START_TIME",
"A"."APPOINTMENT_END_TIME",
"C"."COMPONENT_HOST_NAME",
'unavailable' AS "STATE"
FROM "PUBLIC"."APPOINTMENT" "A"
LEFT OUTER JOIN "PUBLIC"."APPOINTMENT_COMPONENT" "AC"
ON "A"."APPOINTMENT_ID" = "AC"."APPOINTMENT_ID"
INNER JOIN "PUBLIC"."COMPONENT" "C"
ON "C"."COMPONENT_ID" = "AC"."COMPONENT_ID"
WHERE ((CONVERT("A"."APPOINTMENT_START_TIME", TIME) <= DATEADD('minute', "NUM", CAST('00:00:00' AS TIME)))
AND (CONVERT("A"."APPOINTMENT_END_TIME", TIME) >= DATEADD('minute', "NUM", CAST('00:00:00' AS TIME))))
AND ("C"."COMPONENT_ID" IN(1))
FETCH FIRST ROW ONLY
) AS
SELECT
"A"."APPOINTMENT_ID",
"A"."APPOINTMENT_START_TIME",
"A"."APPOINTMENT_END_TIME",
"C"."COMPONENT_HOST_NAME",
'unavailable' AS "STATE"
FROM "PUBLIC"."APPOINTMENT" "A"
LEFT OUTER JOIN "PUBLIC"."APPOINTMENT_COMPONENT" "AC"
ON "A"."APPOINTMENT_ID" = "AC"."APPOINTMENT_ID"
INNER JOIN "PUBLIC"."COMPONENT" "C"
ON "C"."COMPONENT_ID" = "AC"."COMPONENT_ID"
WHERE ((CONVERT("A"."APPOINTMENT_START_TIME", TIME) <= DATEADD('minute', "NUM", CAST('00:00:00' AS TIME)))
AND (CONVERT("A"."APPOINTMENT_END_TIME", TIME) >= DATEADD('minute', "NUM", CAST('00:00:00' AS TIME))))
AND ("C"."COMPONENT_ID" IN(1))
FETCH FIRST ROW ONLY [42122-200] 42S22/42122 (Help)
My code:
with times(num) as
(
select 30 as num
union all select (num + 30)
from times where num < (24*60)
)
select dateadd('minute', num, cast('00:00:00' as time)) as datetimeinterval, unavailabilities.state from times
outer join(
select top 1 a.appointment_id, a.appointment_start_time, a.appointment_end_time, c.component_host_name, 'unavailable' as state
from appointment a
left join appointment_component ac on a.appointment_id = ac.appointment_id
inner join component c on c.component_id = ac.component_id
where
dateadd('minute', -->num<--, cast('00:00:00' as time)) between convert(a.appointment_start_time, time) and convert(a.appointment_end_time, time)
and
c.component_id in (1)
) unavailabilities
TLDR: Trying to get unavailabilities of a list of components by the minute or by a range of minutes (30 minutes here). Num should return a multiple of 30 in this case, depending on the time frame selected for which it will check if the components are taken or not.
N.B. I changed machine=component and appmach=appointment_component (cross table) from the link above
I am not sure about the syntax — I am not very familiar with H2 — but shouldn't num be used in the ON clause instead of inside the subquery? Please check:
with recursive times(num) as
(
    -- half-hour marks: 30, 60, ..., 1440
    select 30 as num
    union all select (num + 30)
    from times where num < (24 * 60)
)
select
    dateadd('minute', num, cast('00:00:00' as time)) as datetimeinterval,
    unavailabilities.state
from times
-- LEFT JOIN keeps every time slot even when no appointment overlaps it
-- (H2 has no bare OUTER JOIN; it must be LEFT/RIGHT/FULL)
left join (
    select a.appointment_id, a.appointment_start_time, a.appointment_end_time,
           c.component_host_name, 'unavailable' as state
    from appointment a
    left join appointment_component ac on a.appointment_id = ac.appointment_id
    inner join component c on c.component_id = ac.component_id
    where c.component_id in (1)
) unavailabilities
-- num is only in scope out here, so the time-overlap test belongs in the
-- ON clause, not inside the derived table where it caused "NUM not found"
on dateadd('minute', num, cast('00:00:00' as time))
   between convert(unavailabilities.appointment_start_time, time)
       and convert(unavailabilities.appointment_end_time, time)

SQL query from Oracle SQL to T-SQL

I have a subquery which is used for an Oracle database, but I want to use an equivalent query for a SQL Server database.
I didn't figure out how to migrate the TO_TIMESTAMP(TO_CHAR(TO_DATE part and also didn't know how to handle the thing with rownums in T-SQL.
Is it even possible to migrate this query?
SELECT 0 run_id,
0 tran_id,
0 sort_id,
' ' tran_type,
10 prod_id,
72 type_id,
1 value,
TO_TIMESTAMP(TO_CHAR(TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1, 'YYYY.MM.DD') || to_char(sw.end_time, 'HH24:MI:SS'), 'YYYY.MM.DD HH24:MI:SS') event_publication,
EXTRACT (YEAR
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) y,
EXTRACT (MONTH
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) mo,
EXTRACT (DAY
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) d,
to_number(to_char (sw.end_time, 'HH24')) h,
to_number(to_char (sw.end_time, 'MI')) mi,
to_number(to_char (sw.end_time, 'SS')) s,
0 ms
FROM all_objects ao,
settlement_win sw,
prod_def pd
WHERE pd.prod_id = 10
AND sw.country = pd.country
AND sw.commodity = pd.commodity
AND rownum <= TO_DATE('2016-03-18 23:59:00', 'YYYY.MM.DD HH24:MI:SS') -TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS')+1
The first thing to address is the use of rownum which has no direct equivalent in TSQL but we can mimic it, and for this particular query you need to recognize that the table ALL_OBJECTS is only being used to produce a number of rows. It has no other purpose to the query.
In TSQL we can generate rows using a CTE and there are many many variants of this, but for here I suggest:
;WITH
cteDigits AS (
SELECT 0 AS digit UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9
)
, cteTally AS (
SELECT
d1s.digit
+ d10s.digit * 10
+ d100s.digit * 100 /* add more like this as needed */
-- + d1000s.digit * 1000 /* add more like this as needed */
+ 1 AS rownum
FROM cteDigits d1s
CROSS JOIN cteDigits d10s
CROSS JOIN cteDigits d100s /* add more like this as needed */
--CROSS JOIN cteDigits d1000s /* add more like this as needed */
)
This will quickly spin-up 1000 rows as is and can be extended to produce many more rows by adding more cross joins. Note this returns a column called rownum which starts at 1 thus mimicking the Oracle rownum.
So next you can just add some of the remaining query, like this:
SELECT
      0 run_id
    , 0 tran_id
    , 0 sort_id
    , ' ' tran_type
    , 10 prod_id
    , 72 type_id
    , 1 value
    -- style 121 = yyyy-mm-dd hh:mi:ss.mmm; precise enough for second-level data
    , convert(varchar, dateadd(day, rownum - 1, '20160318'), 121) event_publication
    -- several missing rows here
    , 0 ms
FROM cteTally
-- cteTally replaces Oracle's ALL_OBJECTS, which was only a row source:
-- it has no relationship to the other tables, so CROSS JOIN it
CROSS JOIN settlement_win sw
INNER JOIN prod_def pd ON sw.country = pd.country AND sw.commodity = pd.commodity
WHERE pd.prod_id = 10
AND rownum <= datediff(day, '20160318', '20160318') + 1
Note that you really do not need a to_timestamp() equivalent you just need the ability to output date and time to the maximum precision of your data which appears to be to the level of seconds.
To progress further (I think) requires an understanding of the data held in the column sw.end_time. If this can be converted to the mssql datetime data type then it is just a matter of adding a number of days to that value to arrive at the event_publication and similarly if sw.end_time is converted to a datetime data type then use date_part() to get the hours, minutes and seconds from that column. e.g.
, DATEADD(day,rownum-1,CONVERT(datetime, sw.end_time)) AS event_publication
also, if such a calculation works then it would be possible to use an apply operator to simplify the overall query, something like this
;WITH
cteDigits AS (
    -- ten digit rows; cross-joined below to spin up a tally table
    SELECT 0 AS digit UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
    SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9
)
, cteTally AS (
    -- rownum 1..1000, mimicking Oracle's rownum; extend with more cross joins as needed
    SELECT
        d1s.digit
        + d10s.digit * 10
        + d100s.digit * 100 /* add more like this as needed */
        -- + d1000s.digit * 1000 /* add more like this as needed */
        + 1 AS rownum
    FROM cteDigits d1s
    CROSS JOIN cteDigits d10s
    CROSS JOIN cteDigits d100s /* add more like this as needed */
    --CROSS JOIN cteDigits d1000s /* add more like this as needed */
)
SELECT
      0 run_id
    , 0 tran_id
    , 0 sort_id
    , ' ' tran_type
    , 10 prod_id
    , 72 type_id
    , 1 value
    , convert(varchar(23), CA.Event_publication, 121) Event_publication
    , datepart(day, CA.Event_publication) dd
    , datepart(month, CA.Event_publication) mm
    , datepart(year, CA.Event_publication) yyyy
    , datepart(hour, CA.Event_publication) hh24
    , datepart(minute, CA.Event_publication) mi
    , datepart(second, CA.Event_publication) ss
    , 0 ms
FROM cteTally
-- the tally is an unconditional row source (replacing ALL_OBJECTS): CROSS JOIN it
CROSS JOIN settlement_win sw
INNER JOIN prod_def pd ON sw.country = pd.country AND sw.commodity = pd.commodity
-- APPLY computes the event timestamp once so the datepart() calls can reuse it
CROSS APPLY (
    SELECT DATEADD(day, rownum - 1, CONVERT(datetime, sw.end_time)) AS event_publication
) CA
WHERE pd.prod_id = 10
AND rownum <= datediff(day, '20160318', '20160318') + 1
NB: It may be necessary to include this datediff(day,'19000101','20160318') (which equals 42445) in the calculation of the event date, e.g.
SELECT DATEADD(day,42445 + (rownum-1),CONVERT(datetime, sw.end_time)) AS event_publication
One last point is that you could use datetime2 instead of datetime if you really do need a greater degree of time precision but there is no easily apparent requirement for that.