Hive: full outer join result contains duplicate rows - hive

Every table has already had its duplicate row keys removed. The SQL looks like this:
-- Counts the rows produced by full-outer-joining dw_rk.f_t_rk_dzxx (tb1)
-- with one ranked address row per gmsfhm (tb2), matched on sfzhm.
select count(*)
from dw_rk.f_t_rk_dzxx tb1
full outer join (
    -- tb2: keep only the row ranked first (num = 1) for each gmsfhm.
    select
        s1.gmsfhm as sfzhm,
        s1.sjjzd_dzbm,
        s1.qxqc,
        s1.xzjd,
        s1.qxc,
        s1.pcs,
        s1.jws,
        s1.dm,
        s1.mph,
        s1.xqqc,
        s1.lfmc,
        s1.dy,
        s1.fh,
        s1.dzqc,
        -- Fixed: the pattern was 'yyyy-MM-dd HH:dd:ss', which prints the day of
        -- month where the minutes belong; 'mm' is the minute field.
        from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss') as d_timestamp,
        'gaj' as d_deptname,
        'f_s_zw_gaj_ldzh_syrk_rkjbxxb' as d_tabname
    from (
        select
            t0.gmsfhm,
            t0.sjjzd_dzbm,
            -- Prefer the value looked up through t2/t3; fall back to the
            -- address row (t1) when the lookup found nothing.
            coalesce(t2.qxmc, t1.qxmc) as qxqc,
            t1.xzjd as xzjd,
            coalesce(t3.qxc, t1.qxc) as qxc,
            t1.pcs as pcs,
            t1.jwqmc as jws,
            t1.dm as dm,
            t1.mph as mph,
            t1.xqmc as xqqc,
            t1.lfmc as lfmc,
            t1.dy as dy,
            t1.fh as fh,
            t1.dzqc as dzqc,
            -- NOTE(review): only rksj is descending; gxsj sorts ascending.
            -- Confirm that is intended. Rows that tie on (gxsj, rksj) are
            -- ranked in an unspecified order, so the row that gets num = 1
            -- is not guaranteed to be the same on every evaluation.
            row_number() over (
                partition by t0.gmsfhm
                order by
                    t0.gxsj,
                    t0.rksj desc
            ) as num
        from dws.f_s_zw_gaj_ldzh_syrk_rkjbxxb t0
        left join dws.f_s_zw_ldzh_bzdz_dzjbxxb t1
            on t0.sjjzd_dzbm = t1.dzbh
        -- NOTE(review): t2 and t3 are distinct over a (code, name) pair but are
        -- joined on the code alone, so a code with several names multiplies
        -- the t0 rows before ranking. Verify each code maps to one name.
        left join (
            select distinct
                ssxqbm,
                qxmc
            from dws.f_s_zw_ldzh_bzdz_dzjbxxb
        ) t2
            on t0.sjjzd_ssxqdm = t2.ssxqbm
        left join (
            select distinct
                qxcbh,
                qxc
            from dws.f_s_zw_ldzh_bzdz_dzjbxxb
        ) t3
            on t0.sjjzd_sqjcwhdm = t3.qxcbh
        where t0.gmsfhm is not null
    ) s1
    where s1.num = 1
) tb2
    on tb1.sfzhm = tb2.sfzhm;
The EXPLAIN output is:
STAGE DEPENDENCIES:
Stage-7 is a root stage
" Stage-12 depends on stages: Stage-7, Stage-13 , consists of Stage-15, Stage-3"
Stage-15 has a backup stage: Stage-3
Stage-11 depends on stages: Stage-15
" Stage-10 depends on stages: Stage-3, Stage-8, Stage-11 , consists of Stage-14, Stage-4"
Stage-14 has a backup stage: Stage-4
Stage-9 depends on stages: Stage-14
" Stage-5 depends on stages: Stage-4, Stage-9"
Stage-1 depends on stages: Stage-5
Stage-4
Stage-3
Stage-8 is a root stage
Stage-16 is a root stage
Stage-13 depends on stages: Stage-16
Stage-0 depends on stages: Stage-1
When I first create an intermediate table to hold tb2's data and join against that table instead, the result is correct.

Related

Query on large table

I have the following query -
-- Lists practitioners with their address, split postal code and the number
-- of matching dx rows, largest count first.
SELECT
    n.fname,
    per_render.render AS practitioner_npi,
    n.address AS address1,
    -- NOTE(review): a start position of 0 with length 6 presumably yields the
    -- first five characters in PostgreSQL; kept exactly as written.
    substring(n.postal, 0, 6) AS zip,
    substring(n.postal, 6, 4) AS zip4,
    per_render.count AS count
FROM (
    -- One row per render value with the count of its qualifying dx rows.
    SELECT
        render,
        count(*) AS count
    FROM dx AS sl
    INNER JOIN annual AS caq
        ON DATE_TRUNC('quarter', date_of_service::date) >= caq.start
    INNER JOIN entities AS ent
        ON sl.render = ent.npi
    WHERE ent.npi_type = '1'
      AND dx_cd IN (
          -- Diagnosis codes ranked 1 to 5 for the chosen bucket.
          SELECT DISTINCT dx_cd
          FROM dx_per_code AS pc
          INNER JOIN bucket AS bac
              ON pc.code = bac.hcpccode
          WHERE bucketname = 'something'
            AND dx_rank BETWEEN 1 AND 5
      )
    GROUP BY render
) AS per_render
INNER JOIN npi AS n
    ON n.npi = per_render.render
-- No column of taxonomy is selected; the join is kept because it can still
-- affect the number of rows returned.
LEFT JOIN taxonomy AS t
    ON t.code = n.taxonomy
ORDER BY per_render.count DESC;
The dx table does not have any indexes and contains approximately 8B records. This query currently takes 20 minutes to run. What indexes or optimizations can I add to make it run faster?

Hive table with multiple partitions

I have a table (data_table) with multiple partition columns year/month/monthkey.
Directories look something like year=2017/month=08/monthkey=2017-08/files.parquet
Which of the below queries would be faster?
select count(*) from data_table where monthkey='2017-08'
or
select count(*) from data_table where monthkey='2017-08' and year = '2017' and month = '08'
I think the initial time Hadoop takes to find the required directories would be longer in the first case, but I want to confirm.
Finding the relevant partitions is a metastore operation and not a file system operation.
It is done by querying the metastore and not by scanning the directories.
The metastore query for the first use case will most likely be faster than the one for the second use case, but in any case we are talking about fractions of a second.
Demo
-- Demo table: a single data column i and three partition columns (x, y, xy).
create external table t100k(i int)
partitioned by (x int,y int,xy string)
;
-- Case 1: filter on the third partition column (xy) only.
explain dependency select count(*) from t100k where xy='100-1000';
The query that was issued against the metastore:
select "PARTITIONS"."PART_ID"
from "PARTITIONS"
inner join "TBLS" on "PARTITIONS"."TBL_ID" = "TBLS"."TBL_ID" and "TBLS"."TBL_NAME" = 't100k'
inner join "DBS" on "TBLS"."DB_ID" = "DBS"."DB_ID" and "DBS"."NAME" = 'local_db'
inner join "PARTITION_KEY_VALS" "FILTER2" on "FILTER2"."PART_ID" = "PARTITIONS"."PART_ID" and "FILTER2"."INTEGER_IDX" = 2
where (("FILTER2"."PART_KEY_VAL" = '100-1000'))
explain dependency select count(*) from t100k where x=100 and y=1000 and xy='100-1000';
The query that was issued against the metastore:
select "PARTITIONS"."PART_ID"
from "PARTITIONS"
inner join "TBLS" on "PARTITIONS"."TBL_ID" = "TBLS"."TBL_ID" and "TBLS"."TBL_NAME" = 't100k'
inner join "DBS" on "TBLS"."DB_ID" = "DBS"."DB_ID" and "DBS"."NAME" = 'local_db'
inner join "PARTITION_KEY_VALS" "FILTER0" on "FILTER0"."PART_ID" = "PARTITIONS"."PART_ID" and "FILTER0"."INTEGER_IDX" = 0
inner join "PARTITION_KEY_VALS" "FILTER1" on "FILTER1"."PART_ID" = "PARTITIONS"."PART_ID" and "FILTER1"."INTEGER_IDX" = 1
inner join "PARTITION_KEY_VALS" "FILTER2" on "FILTER2"."PART_ID" = "PARTITIONS"."PART_ID" and "FILTER2"."INTEGER_IDX" = 2
where ( ( (((case when "FILTER0"."PART_KEY_VAL" <> '__HIVE_DEFAULT_PARTITION__' then cast("FILTER0"."PART_KEY_VAL" as decimal(21,0)) else null end) = 100)
and ((case when "FILTER1"."PART_KEY_VAL" <> '__HIVE_DEFAULT_PARTITION__' then cast("FILTER1"."PART_KEY_VAL" as decimal(21,0)) else null end) = 1000))
and ("FILTER2"."PART_KEY_VAL" = '100-1000')) )
I am posting this as an answer because a comment would lose the formatting.
Kindly accept #Dudu's reply. To verify it, please execute the query below on the metastore DB (MySQL in my case):
mysql> select part_id, location, tbl_id, part_name from PARTITIONS as P inner join SDS as S on P.SD_ID = S.SD_ID where P.TBL_ID = 472;
+---------+-------------------------------------------------------------------------+--------+--------------------------------------+
| part_id | location | tbl_id | part_name |
+---------+-------------------------------------------------------------------------+--------+--------------------------------------+
| 7 | hdfs://hostname:8020/tmp/multi_part/2011/01/2011-01 | 472 | year=2011/month=1/year_month=2011-01 |
| 9 | hdfs://hostname:8020/tmp/multi_part/2012/01/2012-01 | 472 | year=2012/month=1/year_month=2012-01 |
+---------+-------------------------------------------------------------------------+--------+--------------------------------------+
2 rows in set (0.00 sec)
Both queries will pull data from the same HDFS directory, as the location column shows.
The only difference in speed will be from the metastore DB query that is already explained in Dudu's answer.

In Putty, how do you locate where an error is based on the char position?

I received the following error, which is based off of my code (attached below) in Netezza .
^ found "WHERE" (at char 543) expecting an identifier found a keyword
But how do I find where in the code this is located? Do I need to divide 543 by the number of characters per line?
the code is shown below :
-- Load script for the fee/discount fact table: empties a table, then inserts
-- rows built from the DRIVER CTE joined to the dimension tables.
-- NOTE(review): the DELETE targets TDM_FEE_DISCOUNT_FACT while the INSERT
-- below targets FEE_DISCOUNT_FACT (no TDM_ prefix); confirm which name is
-- intended. The doubled ';;' also looks accidental.
DELETE FROM TDM_FEE_DISCOUNT_FACT;;
----------------------------------------------------------------------------
-- INSERT INTO TDM TABLE ---
----------------------------------------------------------------------------
INSERT INTO FEE_DISCOUNT_FACT
(
FEE_DISCNT_F_DK ,
HLTH_PLN_GRP_DK,
HLTH_PLN_SPSR_DK,
PLN_MBR_DK,
COV_PRD_STRT_DT,
COV_PRD_END_DT,
BIL_DUE_DT,
FEE_AMT,
DISCNT_AMT,
BIL_ID,
SS_CD,
TNT_CD,
INSRT_DT,
UPDT_DT,
CREAT_RUN_CYC_EXEC_SK,
LST_UPDT_RUN_CYC_EXEC_SK,
REC_PRCS_TYP_CD,
ROW_EFF_STRT_DT,
ROW_EFF_END_DT,
CUR_ROW_IND
)
-- LAST_RUN_DATE: latest MANIFEST_COMPLETED_TS recorded for manifest
-- 'F_FEE_DISCOUNT', falling back to 01/01/1900 when there is none.
-- NOTE(review): LAST_RUN_DATE is not referenced anywhere else in the visible
-- statement. :DB_XREF (like :DB_TGT further down) is presumably a variable
-- substituted by the client before execution -- TODO confirm it is set.
WITH LAST_RUN_DATE(DT) AS
(
SELECT NVL(LAST_RUN, TO_DATE('01/01/1900', 'MM/DD/YYYY'))
FROM (
SELECT MAX(NVL(TCT.MANIFEST_COMPLETED_TS, TO_DATE('01/01/1900', 'MM/DD/YYYY'))) LAST_RUN
FROM :DB_XREF..TGT_CONTROL_TBL TCT
WHERE MANIFEST = 'F_FEE_DISCOUNT'
) T
)
,
--Need to modify the following section , on line 66 of FEE_DISCOUNT_FACT
-- DRIVER: one row per SUBSCRIBER_BILLING_FEE_DISCOUNT record (SBFD), left-joined
-- to PLAN_MEMBER rows (PM) for the same SBSCR_SK that are current
-- (CUR_ROW_IND = 'Y'), have REC_PRCS_TYP_CD other than 'D', and whose
-- relationship code maps to CONFOR_CD = 'SUB'.
DRIVER AS
(
select
PM.HLTH_PLN_GRP_NBR||'|'||SBFD.SS_CD AS HLTH_PLN_GRP_BK
,PM.HLTH_PLN_GRP_NBR AS SPSR_ID
,PM.PLN_MBR_SK
--COV_PRD_STRT_DT
--COV_PRD_END_DT
,SBFD.BIL_DUE_DT AS BIL_DUE_DT
,NVL(SBFD.FEE_AMT,0) AS FEE_AMT
,NVL(SBFD.DISCNT_AMT,0) AS DISCNT_AMT
,SBFD.BIL_ID
,SBFD.SS_CD
,SBFD.FEE_DISCNT_CREAT_DT_TM
from MBRBOR_TGT_D4..SUBSCRIBER_BILLING_FEE_DISCOUNT SBFD
LEFT OUTER JOIN MBRBOR_TGT_D4..PLAN_MEMBER PM
ON PM.SBSCR_SK = SBFD.SBSCR_SK
AND PM.CUR_ROW_IND = 'Y'
AND PM.REC_PRCS_TYP_CD <> 'D'
AND PM.SBSCR_DPND_RLNSP_TYP_CD_SK IN (SELECT CD_MAP_SK FROM REFBOR_TGT_D4..CODEMAP
WHERE CONFOR_CD = 'SUB')
)
/* IN the code, only need to join to the dimensions .. FACT DK's are the sequence */
-- Final projection: one output row per DRIVER row, with -9 / 0 defaults
-- substituted for missing values.
SELECT NEXT VALUE FOR FEE_DISCOUNT_F_SEQ AS FEE_DISCNT_F_DK,
NVL(HPGD.HLTH_PLN_GRP_DK,-9) AS HLTH_PLN_GRP_DK,
NVL(HPGD.HLTH_PLN_SPSR_DK,-9) AS HLTH_PLN_SPSR_DK,
-- NOTE(review): PM is an alias inside the DRIVER CTE only; the FROM clause
-- below exposes DR, HPGD, HPSD and PMD.
PM.PLN_MBR_SK AS PLN_MBR_DK,
-- NOTE(review): '??' is a placeholder, not a valid alias; the source of the
-- coverage-period dates still has to be filled in.
NVL(??.COV_PRD_STRT_DT,-9) AS COV_PRD_STRT_DT,
NVL(??.COV_PRD_END_DT,-9) AS COV_PRD_END_DT,
NVL(BIL_DUE_DT,0) AS BIL_DUE_DT,
NVL(FEE_AMT, 0) AS FEE_AMT,
NVL(DISCNT_AMT, 0) AS DISCNT_AMT,
NVL(BIL_ID,-9) AS BIL_ID,
DR.SS_CD AS SS_CD,
'BSC' AS TNT_CD,
NOW() AS INSRT_DT,
NOW() AS UPDT_DT,
-- NOTE(review): SBFD is likewise visible only inside DRIVER, and DRIVER does
-- not select any of the SBFD columns referenced on the next six lines.
NVL(SBFD.CREAT_RUN_CYC_EXEC_SK,-9) AS CREAT_RUN_CYC_EXEC_SK,
NVL(SBFD.LST_UPDT_RUN_CYC_EXEC_SK,-9) AS LST_UPDT_RUN_CYC_EXEC_SK,
NVL(SBFD.REC_PRCS_TYP_CD,0) AS REC_PRCS_TYP_CD,
NVL(SBFD.ROW_EFF_STRT_DT,0) AS ROW_EFF_STRT_DT,
NVL(SBFD.ROW_EFF_END_DT,0) AS ROW_EFF_END_DT,
NVL(SBFD.CUR_ROW_IND,0) AS CUR_ROW_IND --FEE_DISCNT_CREAT_DT_TM
FROM DRIVER DR
-- Each dimension is matched on its key and on the dimension row whose
-- effective date range contains FEE_DISCNT_CREAT_DT_TM.
LEFT OUTER JOIN :DB_TGT..HEALTH_PLAN_GROUP_DIMENSION HPGD ON DR.HLTH_PLN_GRP_BK = HPGD.HLTH_PLN_GRP_BK
AND FEE_DISCNT_CREAT_DT_TM BETWEEN HPGD.ROW_EFF_STRT_DT AND HPGD.ROW_EFF_END_DT
LEFT OUTER JOIN :DB_TGT..HEALTH_PLAN_SPONSOR_DIMENSION HPSD ON DR.SPSR_ID = HPSD.SPSR_ID
AND FEE_DISCNT_CREAT_DT_TM BETWEEN HPSD.ROW_EFF_STRT_DT AND HPSD.ROW_EFF_END_DT --DONE
--some of the joins will have different conditions . like for SS_CD it will be different
LEFT OUTER JOIN :DB_TGT..PLAN_MEMBER_DIMENSION PMD ON DR.PLN_MBR_SK = PMD.PLN_MBR_SK
AND FEE_DISCNT_CREAT_DT_TM BETWEEN PMD.ROW_EFF_STRT_DT AND PMD.ROW_EFF_END_DT
--LEFT OUTER JOIN
-- Plan-member rows paired with the subscriber's own row (PLN_MBR_SK = SBSCR_SK)
-- that was effective on the day before the member row's end date.
-- NOTE(review): this derived table reuses the alias PMD, which the join
-- directly above already defines.
LEFT OUTER JOIN
(SELECT SBSCR.PLN_MBR_DK AS SBSCR_DK, PM.*
FROM :DB_TGT..PLAN_MEMBER_DIMENSION PM
LEFT OUTER JOIN
(SELECT PM.PLN_MBR_DK, PM.PLN_MBR_SK, PM.SBSCR_SK, PM.CUR_ROW_IND, PM.ROW_EFF_STRT_DT, PM.ROW_EFF_END_DT FROM
:DB_TGT..PLAN_MEMBER_DIMENSION PM WHERE PLN_MBR_SK = SBSCR_SK) SBSCR
ON PM.SBSCR_SK = SBSCR.SBSCR_SK AND (PM.ROW_EFF_END_DT -1) BETWEEN
SBSCR.ROW_EFF_STRT_DT AND SBSCR.ROW_EFF_END_DT) PMD
ON DR.PLN_MBR_SK = PMD.PLN_MBR_SK
-- NOTE(review): DRIVER does not select a CAPITN_ERN_FROM_DT column, and the
-- statement ends here without a terminating semicolon.
AND DR.CAPITN_ERN_FROM_DT BETWEEN PMD.ROW_EFF_STRT_DT AND PMD.ROW_EFF_END_DT
Thanks
Since vi is the editor used for the queries (if you're in nzsql), you can navigate 543 characters from the start by typing 0543l in the editor: this takes you to the start and then 543 steps to the right. This should bring you to the area of the SQL where the problem is.

Oracle SQL subquery scalar as input to other subquery

In the query below, I'm using Subquery 1 and Subquery 2 to get the account number and the account name. Both subqueries join the same tables, except that Subquery 2 also joins account_nameinfo_t to get the account name. Is there a way to avoid selecting from the other tables again and just use the result of Subquery 1 (i.e. the account number) to get the account name in Subquery 2?
-- For every discount-sharing group balance, look up the parent account
-- number and the parent account name with two scalar subqueries that are
-- both correlated on grpbm.obj_id0.
SELECT
    (
        -- Subquery 1: the parent account number.
        SELECT parent_acct.account_no
        FROM group_t parent_grp
        INNER JOIN account_t parent_acct
            ON parent_acct.poid_id0 = parent_grp.ACCOUNT_OBJ_ID0
        WHERE parent_grp.poid_id0 = grpbm.obj_id0
    ) AS PARENT_ACCOUNT,
    (
        -- Subquery 2: the parent account name, which lives in a different table.
        SELECT name_info.Firstname || ' ' || name_info.LastName
        FROM group_t parent_grp
        INNER JOIN account_t parent_acct
            ON parent_acct.poid_id0 = parent_grp.ACCOUNT_OBJ_ID0
        INNER JOIN account_nameinfo_t name_info
            ON name_info.obj_id0 = parent_acct.poid_id0
        WHERE parent_grp.poid_id0 = grpbm.obj_id0
    ) AS "ACCOUNT NAME",
    bgs.REC_ID2 AS RECORD_TYPE,
    bgs.current_bal AS VALUE
FROM group_t grp
INNER JOIN group_billing_members_t grpbm
    ON grpbm.OBJECT_ID0 = grp.ACCOUNT_OBJ_ID0
INNER JOIN BAL_GRP_SUB_BALS_T bgs
    ON bgs.obj_id0 = grp.BAL_GRP_OBJ_ID0
WHERE poid_type = '/group/sharing/discounts'
  AND bgs.rec_id2 NOT IN (1000203, 1030001, 1000303, 1000306)
ORDER BY PARENT_ACCOUNT;
It looks like you can simplify this using simple joins rather than subqueries, either in the select list or as inline views:
-- Account number and name for each discount-sharing group balance, resolved
-- with plain joins: group -> billing members -> second group_t row ->
-- account -> name record with rec_id = 1.
SELECT
    parent_acct.account_no AS "PARENT ACCOUNT",
    name_info.first_name || ' ' || name_info.last_name AS "ACCOUNT NAME",
    bal.rec_id2 AS record_type,
    bal.current_bal
FROM group_t grp
INNER JOIN bal_grp_sub_bals_t bal
    ON bal.obj_id0 = grp.bal_grp_obj_id0
INNER JOIN group_billing_members_t members
    ON members.obj_id0 = grp.account_obj_id0
INNER JOIN group_t parent_grp
    ON parent_grp.poid_id0 = members.obj_id0
INNER JOIN account_t parent_acct
    ON parent_acct.poid_id0 = parent_grp.account_obj_id0
INNER JOIN account_nameinfo_t name_info
    ON name_info.obj_id0 = parent_acct.poid_id0
WHERE grp.poid_type = '/group/sharing/discounts'
  AND bal.rec_id2 NOT IN (1000203, 1030001, 1000303, 1000306)
  AND name_info.rec_id = 1
ORDER BY "PARENT ACCOUNT";
You only seem to be using group_billing_members_t between two references to group_t, and it isn't clear if they both point to the same record, or if that expands to multiple rows. The column names seem a bit inconsistent, which may be from your retyping the code rather than copying and pasting it. If it is the same record then you seem to be able to remove that table and the rejoin:
-- Simplified variant: the account is reached directly from the group row,
-- without going through group_billing_members_t and a second group_t row.
SELECT
    owner_acct.account_no AS "PARENT ACCOUNT",
    name_info.first_name || ' ' || name_info.last_name AS "ACCOUNT NAME",
    bal.rec_id2 AS record_type,
    bal.current_bal
FROM group_t grp
INNER JOIN account_t owner_acct
    ON owner_acct.poid_id0 = grp.account_obj_id0
INNER JOIN account_nameinfo_t name_info
    ON name_info.obj_id0 = owner_acct.poid_id0
INNER JOIN bal_grp_sub_bals_t bal
    ON bal.obj_id0 = grp.bal_grp_obj_id0
WHERE grp.poid_type = '/group/sharing/discounts'
  AND name_info.rec_id = 1
  AND bal.rec_id2 NOT IN (1000203, 1030001, 1000303, 1000306)
ORDER BY "PARENT ACCOUNT";
Without table structures, relationships, sample data and expected results that's rather speculative though.
Oracle supports a WITH clause which you may find useful: http://psoug.org/reference/with.html
Essentially, it allows you to create a temporary view within a query that can be accessed multiple times. In your case, the result of your common join can be "factored out" and the result can be reused.
As ALEXPOOL suggested, here is what I tried with ANSI joins, and it works. Any improvement is welcome.
-- Same result as the inline-view version, with each intermediate result set
-- named in a WITH clause.
WITH discount_balances AS (
    -- Balances of discount-sharing groups, keyed by the billing member's OBJ_ID0.
    SELECT
        bgs.REC_ID2 AS RECORD_TYPE,
        bgs.current_bal,
        grpbm.OBJ_ID0
    FROM group_t grp
    INNER JOIN group_billing_members_t grpbm
        ON grpbm.OBJECT_ID0 = grp.ACCOUNT_OBJ_ID0
    INNER JOIN BAL_GRP_SUB_BALS_T bgs
        ON bgs.obj_id0 = grp.BAL_GRP_OBJ_ID0
    WHERE poid_type = '/group/sharing/discounts'
      AND bgs.rec_id2 NOT IN (1000203, 1030001, 1000303, 1000306)
),
named_accounts AS (
    -- Account number plus the name taken from the name record with rec_id = 1.
    SELECT
        acct.account_no AS "PARENT ACCOUNT",
        ant.FIRST_NAME || ' ' || ant.LAST_NAME AS "ACCOUNT NAME",
        acct.poid_id0
    FROM account_t acct
    INNER JOIN account_nameinfo_t ant
        ON acct.poid_id0 = ant.obj_id0
    WHERE ant.rec_id = 1
)
SELECT
    na."PARENT ACCOUNT",
    na."ACCOUNT NAME",
    db.RECORD_TYPE,
    db.CURRENT_BAL
FROM discount_balances db
INNER JOIN group_t grp1
    ON grp1.poid_id0 = db.OBJ_ID0
INNER JOIN named_accounts na
    ON na.poid_id0 = grp1.ACCOUNT_OBJ_ID0
ORDER BY na."PARENT ACCOUNT";

another union all selecting all the rows that have not already been selected

Right now I have two SELECT statements that are combined by a UNION ALL. What I was hoping to do was name the first query something like query1 and the second one query2, and then add a third query that selects the rows whose bookno is not in query1 or query2.
-- Two passes over the same booking/passenger/sector join, combined with
-- UNION ALL and ordered by booking number:
--   1) STYPE = 3 rows matched to TSTAGES on tour, route and board+alight;
--   2) STYPE = 1 rows matched the same way and additionally on seq.
-- NOTE(review): the TOP (100) PERCENT ... ORDER BY inside each derived table
-- is kept as written; it presumably has no effect on the final row order,
-- which is set by the last ORDER BY.
SELECT DISTINCT
    pax.BOOKNO, pax.PaxName, pax.Locator, pax.FDATE7,
    pax.BOARD, pax.ALIGHT, pax.AIRLINE, pax.FNUMBR, pax.DEP,
    pax.ARR, pax.TOUR, pax.ROUTE,
    stg.tour, stg.route, stg.sfrom, stg.sto, stg.seq, pax.seq,
    'yes'
FROM (
    SELECT TOP (100) PERCENT
        nm.BOOKNO,
        RTRIM(nm.SRNAME) + '/' + RTRIM(nm.FIRST) + RTRIM(nm.TITLE) AS PaxName,
        pn.PNR AS Locator,
        sec.FDATE7,
        sec.BOARD,
        sec.ALIGHT,
        sec.AIRLINE,
        sec.FNUMBR,
        sec.DEP,
        sec.ARR,
        bk.TOUR,
        bk.ROUTE,
        bs.SEQ,
        (sec.BOARD + sec.ALIGHT) AS both
    FROM test.dbo.BOOKINGS AS bk
    LEFT OUTER JOIN test.dbo.BNAMES AS nm
        ON bk.BOOKNO = nm.BOOKNO
    LEFT OUTER JOIN test.dbo.BSTAGES AS bs
        ON nm.BOOKNO = bs.BOOKNO
    LEFT OUTER JOIN test.dbo.PNRSECTORS AS sec
        ON bs.SCODE = sec.SKEY
    LEFT OUTER JOIN test.dbo.PNRS AS pn
        ON sec.PNRKEY = pn.PNRKEY
    WHERE (bs.STYPE = 3)
    ORDER BY bk.BOOKNO, nm.SEQ, locator
) AS pax
INNER JOIN (
    SELECT TOUR, ROUTE, OFFSET, SEQ, SCODE, SFROM, STO, (SFROM + STO) AS BOTH
    FROM test.dbo.TSTAGES
) AS stg
    ON stg.tour = pax.tour
   AND stg.route = pax.route
   AND (pax.both = stg.both)

UNION ALL

SELECT DISTINCT
    pax.BOOKNO, pax.PaxName, pax.Locator, pax.FDATE7,
    pax.BOARD, pax.ALIGHT, pax.AIRLINE, pax.FNUMBR, pax.DEP,
    pax.ARR, pax.TOUR, pax.ROUTE,
    stg.tour, stg.route, stg.sfrom, stg.sto, stg.seq, pax.seq,
    'YES'
FROM (
    SELECT TOP (100) PERCENT
        nm.BOOKNO,
        RTRIM(nm.SRNAME) + '/' + RTRIM(nm.FIRST) + RTRIM(nm.TITLE) AS PaxName,
        pn.PNR AS Locator,
        sec.FDATE7,
        sec.BOARD,
        sec.ALIGHT,
        sec.AIRLINE,
        sec.FNUMBR,
        sec.DEP,
        sec.ARR,
        bk.TOUR,
        bk.ROUTE,
        bs.SEQ,
        (sec.BOARD + sec.ALIGHT) AS both
    FROM test.dbo.BOOKINGS AS bk
    LEFT OUTER JOIN test.dbo.BNAMES AS nm
        ON bk.BOOKNO = nm.BOOKNO
    LEFT OUTER JOIN test.dbo.BSTAGES AS bs
        ON nm.BOOKNO = bs.BOOKNO
    LEFT OUTER JOIN test.dbo.PNRSECTORS AS sec
        ON bs.SCODE = sec.SKEY
    LEFT OUTER JOIN test.dbo.PNRS AS pn
        ON sec.PNRKEY = pn.PNRKEY
    WHERE (bs.STYPE = 1)
    ORDER BY bk.BOOKNO, nm.SEQ, locator
) AS pax
INNER JOIN (
    SELECT TOUR, ROUTE, OFFSET, SEQ, SCODE, SFROM, STO, (SFROM + STO) AS BOTH
    FROM test.dbo.TSTAGES
) AS stg
    ON stg.tour = pax.tour
   AND stg.route = pax.route
   AND stg.seq = pax.seq
   AND (pax.both = stg.both)
ORDER BY bookno
END
How about using WITH? You can declare your queries, combine them with UNION, and then search for the rows that are not in the result.
Take a look here: Multiple Select Statements using SQL Server 2005 "WITH" Statement . It should help you get started.
By using WITH statement, you will isolate logic of your queries, making your overall query more understandable.
just wrap your logic around what you wrote:
select bookno
where key not in (
your big select statement...
)