{SAS} Converting a SQL merge in to HASH merge

{SAS} Converting a SQL merge in to HASH merge - sql

/* Creates ndd1: the default records selected in subquery a, each flagged with
   default_flag = 1 and left-joined on counterparty_id to the account rows of subquery b. */
proc sql;
create table ndd1 as
/* NOTE(review): c.limit uses an alias c that no FROM or JOIN in this statement defines;
   presumably another join was left out of the post - confirm before running. */
select a.*, 1 as default_flag, b.retail_account_no, c.limit
from (
/* a: rows of gdwh30.tb0_default whose posting_date equals the date held in the tt macro
   variable, excluding one literal counterparty_id value and filtered on the relative
   order of last_default_date, last_out_of_default_date and posting_date */
select
posting_date, counterparty_id, counterparty_indicator, customer_id, last_default_date, last_out_of_default_date
from gdwh30.tb0_default
where counterparty_id<>'*noval*'
and ((last_out_of_default_date < posting_date and last_default_date >= last_out_of_default_date)
or (last_out_of_default_date > posting_date and last_default_date < last_out_of_default_date))
and posting_date = to_date(%nrbquote(')&tt.%nrbquote('), 'DD-MON-YYYY hh24:mi')) a
left join (
/* b: rows of gdwh30.TB0_account with entity_id RBBG whose bus_date_from..bus_date_until
   interval contains the same date; counterparty_id is derived from REF_account_ID as the
   first run of characters that are not asterisks (Oracle REGEXP_SUBSTR semantics assumed) */
select retail_account_no, REF_account_ID, regexp_substr(REF_account_ID,'[^*]+', 1) as counterparty_id
from gdwh30.TB0_account
/*tb_account*/
where bus_date_from<= to_date(%nrbquote(')&tt.%nrbquote('), 'DD-MON-YYYY hh24:mi')
and bus_date_until>=to_date(%nrbquote(')&tt.%nrbquote('), 'DD-MON-YYYY hh24:mi')
and entity_id='RBBG') b
on a.counterparty_id=b.counterparty_id
;quit;
&tt. is of the form
30APR2020:00:00:00
or date22.
/* Attempted hash-object version of the filter: outputs a row only when its posting_date
   is found in hash k and its today1 value is found in hash j. */
data ndd_1;
/* Never executes (the condition is 0); it is here so the compiler defines the variables
   of both tables before the hash objects are declared. */
if 0 then set gdwh30.tb0_default today ;
if _n_=1 then do;
/* k: gdwh30.tb0_default loaded into memory, keyed on posting_date. multidata is not
   requested, so presumably only one item per posting_date is kept - confirm this is intended. */
declare hash k(dataset:"gdwh30.tb0_default");
k.definekey ("posting_date");
k.definedata ("counterparty_id", "posting_date", "last_out_of_default_date", "last_default_date");
k.definedone();
/* j: the table today, keyed on its single variable today1. */
declare hash j(dataset:"today");
j.definekey("today1");
j.definedata ("today1");
j.definedone();
end;
/* NOTE(review): this reads ndd_1, the same dataset this step creates; presumably the
   source table was meant here - confirm. */
set ndd_1;
/* find() returns 0 when the key exists; both lookups must succeed for the row to be kept. */
if k.find(key:posting_date)=0 and j.find(key:today1)=0 then output;
run;
What I think I must do is format the columns before the `if _n_=1` block.
For the second join, on TB0_account, I want to attempt a fuzzy merge, so any help will be greatly appreciated.
Today1 is &tt. but in a table instead of a list. I first try to join the A part just to get a feel for these hash joins.
Posting_date is of the form 31AUG2016:00:00:00

Related

Hash join equivalent on PROC SQL between

I usually use PROC SQL when I'm joining tables on a key together with a date condition (i.e. target_date falls between start_date and end_date).
I've been able to successfully translate this to a hash join when considering an INNER JOIN:
/* Inner join of table_1 to table_2 on key_1 using a hash object, keeping only the
   matches whose start_date..end_date range contains target_date. */
data hash_join;
if _n_ = 1 then do;
/* Load table_2 once; multidata:'Y' keeps every item that shares a key_1 value. */
declare hash add1(dataset:'table_2',multidata: 'Y');
add1.defineKey('key_1');
add1.defineData('start_date','end_date','value_1');
add1.defineDone();
end;
/* Defines the hash data variables in this step so find() has host variables to fill. */
format
start_date date9.
end_date date9.
value_1 10.5
;
set table_1 (keep=key_1 target_date);
/* find() retrieves the first item for the current key_1; find_next() walks the remaining
   items and returns non-zero when there are none left, which ends the loop. */
if add1.find() = 0 then do until (add1.find_next());
if start_date le target_date le end_date then output;
end;
run;
Which is the same thing as:
/* Inner join of table_1 to table_2 on key_1, restricted to rows whose target_date lies
   inside the start_date..end_date range (both bounds included). */
proc sql;
    create table sql_join as
    select
        rng.start_date,
        rng.end_date,
        rng.value_1,
        src.key_1,
        src.target_date
    from table_1 as src
    inner join table_2 as rng
        on  src.key_1 = rng.key_1
        and src.target_date >= rng.start_date
        and src.target_date <= rng.end_date
    ;
quit;
I'm having trouble figuring out what the equivalent would be to a LEFT JOIN though. For instance, if something doesn't JOIN, I'd want to output, which I think is straightforward:
if add1.find() ne 0 then output;
And if it JOINs and the date is between, that seems straightforward as well:
if add1.find() = 0 then do until (add1.find_next());
if start_date le target_date le end_date then output;
end;
But how do I get the rest of the records from table_1 that might join, but don't have the target_date between the start_date and end_date? For instance, let's say table_2 is a start_date and end_date of a sale, and that sale didn't start until February 1st for a key_1 = 'Clothes'. If my table_1 has 'Clothes' and sales on January 1st, it will JOIN on the key, but I want to output the blank value. Any ideas on how to do this?
Any help would be much appreciated!

You just need to keep track of whether you've found a match or not. Since you're not using the hash find to track the 'between' part of things, you can't use that, so you just have to do it yourself.
See this example. Here I modify SASHELP.CLASS to look like your input tables, then add a bit of logic to see if anything was found.
/* Test input shaped like the question's table_1: SASHELP.CLASS with age renamed to
   target_date and name renamed to key_1; height and weight are dropped. */
data table_1;
set sashelp.class;
rename age=target_date name=key_1;
drop height weight;
run;
/* Test lookup shaped like the question's table_2: zero, one or two date ranges per
   SASHELP.CLASS row, with name renamed to key_1 and height renamed to value_1. */
data table_2;
set sashelp.class;
/* mod(_n_,3) is 0, 1 or 2, so a row yields no range, one range or two ranges. */
do _i = 1 to mod(_n_,3);
start_date = age-3+_i;
end_date = age+1-_i;
/* Only well-formed ranges are written. */
if start_date le end_date then output;
end;
rename name=key_1 height=value_1;
/* Variables not listed here (such as _i) are left out of the output. */
keep height weight start_date age end_date name;
run;
/* Left join of table_1 to table_2 on key_1 with a date-range condition, using a hash
   object. Every table_1 row is written at least once: once per table_2 item whose
   start_date..end_date range contains target_date, or once with the table_2 columns
   missing when no item qualifies (no key match, or key match outside every range). */
data hash_join;
    if _n_ = 1 then do;
        /* Load table_2 once; multidata:'Y' keeps every item that shares a key_1 value. */
        declare hash add1(dataset:'table_2',multidata: 'Y');
        add1.defineKey('key_1');
        add1.defineData('start_date','end_date','value_1');
        add1.defineDone();
    end;
    /* Defines the hash data variables in this step so find() has host variables to fill. */
    format
        start_date date9.
        end_date date9.
        value_1 10.5
    ;
    set table_1 (keep=key_1 target_date);
    /* Tracks whether any hash item satisfied the range test for this table_1 row. */
    found = 0;
    if add1.find() = 0 then do until (add1.find_next());
        if start_date le target_date le end_date then do;
            found=1;
            output;
        end;
    end;
    if not (found) then do;
        /* Clear ALL hash data variables, not only value_1: after a key match that fails
           the range test they still hold the last item retrieved, and would otherwise
           be written out as if they belonged to this row. */
        call missing(start_date, end_date, value_1);
        output;
    end;
run;

I think you just need to track if something has the key, but not in the range:
/* Left-join logic for one table_1 row: write it once when the key is absent, once per
   in-range hash item when the key is present, and once with the hash columns cleared
   when the key is present but no item's range contains target_date. */
if add1.find() ^=0 then output;
else do;
    found = 0;
    /* find() above already retrieved the first item; find_next() walks the rest and
       returns non-zero when none are left. */
    do until (add1.find_next());
        if start_date le target_date le end_date then do;
            output;
            found=1;
        end;
    end;
    if ^found then do;
        /* The hash data variables still hold the last item retrieved; clear them so the
           unmatched row is written with missing values, as a left join would produce. */
        call missing(start_date, end_date, value_1);
        output;
    end;
end;
No data to test with, so this is just me coding in SO. Let me know if it doesn't work.

Combine 2 SQL queries into one while eliminating duplicate columns

I have written 2 sql queries by using multiple join statements.
Now I want to combine both the query results as below.
First query shows these columns
UDC_ID, EXT_ID, VALUE
The second query shows these columns
UDC_ID, EXT_ID, VALUE
In both the queries UDC_ID and EXT_ID columns are the same, but the VALUE column in each is different
So the final output I want to display is,
UDC_ID, EXT_ID, VALUE (From Query1), VALUE (from Query 2)
Can anyone suggest how this can be achieved?
These are my queries:
Query 1 joins three tables:
-- Query 1: one row per HMI_22 service request attribute named CommsHubGUID, returning the
-- device's UDC_ID, the request's EXT_ID and the attribute VALUE, newest request first.
SELECT
DEV.UDC_ID,
SR.EXT_ID,
SRA.VALUE
FROM SERVICE_REQUEST SR
JOIN DEVICE DEV
ON SR.DEVICE_ID = DEV.ID
JOIN SERVICE_REQUEST_ATTR SRA
-- NOTE(review): SERVICE_REQUEST_ID is unqualified; presumably it is a column of
-- SERVICE_REQUEST_ATTR (SRA) - confirm and qualify it.
ON SR.ID = SERVICE_REQUEST_ID
WHERE SR.SUB_TYPE_CD = 'HMI_22'
--AND DEV.SUB_TYPE = 'ESME'
AND SRA.NAME = 'CommsHubGUID'
-- Both bounds are inclusive, so a request stamped exactly 2016-09-28 00:00:00 is returned.
AND SR.INSERT_TIME >= TO_DATE('2016-09-21 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
AND SR.INSERT_TIME <= TO_DATE('2016-09-28 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
ORDER BY SR.INSERT_TIME DESC;
The difference between query 1 and this query is the where clause criterion for SRA.NAME field, otherwise both the queries are same.
-- Query 2: one row per HMI_22 service request attribute named 'Service Location',
-- returning the device's UDC_ID, the request's EXT_ID and the attribute VALUE,
-- newest request first.
SELECT
    DEV.UDC_ID,
    SR.EXT_ID,
    SRA.VALUE
FROM SERVICE_REQUEST SR
JOIN DEVICE DEV
    ON SR.DEVICE_ID = DEV.ID
JOIN SERVICE_REQUEST_ATTR SRA
    -- NOTE(review): SERVICE_REQUEST_ID is unqualified; presumably it is a column of
    -- SERVICE_REQUEST_ATTR (SRA) - confirm and qualify it.
    ON SR.ID = SERVICE_REQUEST_ID
WHERE SR.SUB_TYPE_CD = 'HMI_22'
--AND DEV.SUB_TYPE = 'ESME'
AND SRA.NAME = 'Service Location'
-- Both bounds are inclusive, so a request stamped exactly 2016-09-28 00:00:00 is returned.
AND SR.INSERT_TIME >= TO_DATE('2016-09-21 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
AND SR.INSERT_TIME <= TO_DATE('2016-09-28 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
ORDER BY SR.INSERT_TIME DESC;

-- Puts the VALUE of both queries side by side, matched on (UDC_ID, EXT_ID).
-- "(query 1)" and "(query 2)" are placeholders for the two queries; drop their
-- ORDER BY clauses when pasting them in as subqueries.
-- FULL OUTER JOIN keeps pairs that appear in only one query; COALESCE then takes the
-- key columns from whichever side is present.
SELECT COALESCE(q1.UDC_ID, q2.UDC_ID) AS UDC_ID,
       COALESCE(q1.EXT_ID, q2.EXT_ID) AS EXT_ID,
       q1.VALUE AS VALUE_Q1,   -- VALUE from query 1
       q2.VALUE AS VALUE_Q2    -- VALUE from query 2
FROM (query 1) q1
FULL OUTER JOIN (query 2) q2
    ON  q1.UDC_ID = q2.UDC_ID
    AND q1.EXT_ID = q2.EXT_ID

how to join 2 query into one in EXCEL ORACLE CONNECTION

I have 2 queries.
I am trying to combine them so that I can export from a single query instead of manually joining the results in Excel.
/* First query: account, problem and history columns for accounts that have a NAME and a
   null DISC. Parameters, in order: (1) lower bound for b.TIME_STAMP (exclusive),
   (2) upper bound for c.RESTORED_TIME (exclusive).
   The joins are written as explicit INNER JOINs so that each join condition sits next
   to the table it belongs to; the WHERE clause holds only the filters. */
(SELECT
    b.OUT_NO,
    a.ACCNO,
    a.BILL_ACCNO,
    a.NAME,
    a.HOUSE_NO,
    a.STREET,
    a.HOUSE_NO2,
    a.ZIP,
    a.ID,
    b.TIME_STAMP,
    b.REST_DATE,
    c.RESTORED_TIME,
    b.OUT_CMNT
FROM brook.history c
INNER JOIN brook.problem b
    ON b.OUT_NO = c.OUT_NO
INNER JOIN brook.account a
    ON a.ID = c.ID
WHERE a.NAME IS NOT NULL
  AND a.DISC IS NULL
  AND b.TIME_STAMP > ?
  AND c.RESTORED_TIME < ?
)
and
/* Second query: the same columns for accounts that have a NAME and a null DISC.
   Parameters, in order: (1) lower and (2) upper bound for b.TIME_STAMP, then
   (3) lower and (4) upper bound for c.RESTORED_TIME; all four bounds are exclusive.
   The joins are written as explicit INNER JOINs so that each join condition sits next
   to the table it belongs to; the WHERE clause holds only the filters.
   NOTE(review): this query joins on OUTAGE_NO while it selects b.OUT_NO and the first
   query joins on OUT_NO - confirm which column name is correct. */
(SELECT
    b.OUT_NO,
    a.ACCNO,
    a.BILL_ACCNO,
    a.NAME,
    a.HOUSE_NO,
    a.STREET,
    a.HOUSE_NO2,
    a.ZIP,
    a.ID,
    b.TIME_STAMP,
    b.REST_DATE,
    c.RESTORED_TIME,
    b.OUT_CMNT
FROM brook.history c
INNER JOIN brook.problem b
    ON b.OUTAGE_NO = c.OUTAGE_NO
INNER JOIN brook.account a
    ON a.ID = c.ID
WHERE a.NAME IS NOT NULL
  AND a.DISC IS NULL
  AND b.TIME_STAMP > ?
  AND b.TIME_STAMP < ?
  AND c.RESTORED_TIME > ?
  AND c.RESTORED_TIME < ?
)
How can I combine these two queries into one? I tried UNION ALL, but I get the error ORA-01847: day of month must be between 1 and last day of month.
The ? placeholders are parameters; they are linked to cells on the spreadsheet.
The Excel date parameters have this format: 11/04/2013 00:00:00
Thanks

Error is about a date format, not about union.
If you pass cell values as string parameters, Oracle tries to convert them to dates in order to compare them with the date or timestamp columns in the table. For this conversion Oracle uses its default date format, which is not mm/dd/yyyy hh24:mi:ss in your case.
There are 2 possibilities to fix a situation:
Pass parameters with date type to query and convert values to dates before passing it to Oracle. Check examples on MSDN and description of CreateParameter and Parameters.Append methods.
Convert values to dates in query with to_date Oracle function.
Change conditions in query from
AND (b.TIME_STAMP>?)
AND (c.RESTORED_TIME<?))
and
AND (b.TIME_STAMP > ? And b.TIME_STAMP < ?)
AND (c.RESTORED_TIME > ? And c.RESTORED_TIME < ?)
to
AND (b.TIME_STAMP > to_date(?,'mm/dd/yyyy hh24:mi:ss') )
AND (c.RESTORED_TIME < to_date(?,'mm/dd/yyyy hh24:mi:ss') ))
and
AND (
b.TIME_STAMP > to_date(?,'mm/dd/yyyy hh24:mi:ss')
And
b.TIME_STAMP < to_date(?,'mm/dd/yyyy hh24:mi:ss')
)
AND (
c.RESTORED_TIME > to_date(?,'mm/dd/yyyy hh24:mi:ss')
And
c.RESTORED_TIME < to_date(?,'mm/dd/yyyy hh24:mi:ss')
)

Sql query for following scenario

I have following scenario, for which i need to write sql query.
I have ICCID table and ICCID property table which holds following information.
I want to find all active ICCIDs, plus the ICCIDs that went into the removed state in December 2012. For ICCIDs in the removed state, the date.to.change key in the ICCID property table records the date on which the ICCID was removed.
This is my attempt, but it did not work:
-- Attempt: test every condition against a single ICCID_PROPERTY row e.
select e.ID_ICCID from ICCID_PROPERTY e where
-- First alternative: the row is the STATE property with value Active.
e.c_key ='STATE' and e.c_value='Active' or(
-- Second alternative: the same row would need c_key equal to both 'STATE' and
-- 'date.to.change', and c_value equal to 'Removed' while also parsing as a yyyymmdd
-- date, so this branch can never be true for any single row.
e.c_key ='STATE' and
e.c_value='Removed' and
e.c_key='date.to.change' and
to_date(e.c_value,'yyyymmdd') >=to_date('2012-DEC-01 00:00:00', 'YYYY-MON-DD HH24:MI:SS') and
to_date(e.c_value,'yyyymmdd') <= to_date('2012-DEC-31 23:59:59', 'YYYY-MON-DD HH24:MI:SS')
-- The line below closes two parentheses but only one was opened (after "or").
))
Thanks in advance for any help

This is one of the issues with a key-value pair design such as this...
You can't just check a single property row to see if it matches the search criteria, since the criteria in this case will span multiple properties... you have to check if a single parent row has all the children properties that match:
-- Returns each ICCID row that either has a STATE property equal to 'Active', or has both
-- a STATE property equal to 'Removed' and a date.to.change property whose yyyymmdd value
-- falls between 2012-12-01 00:00:00 and 2012-12-31 23:59:59 (bounds included).
-- Each condition gets its own EXISTS because each one lives on a different property row.
SELECT i.ICCID
FROM ICCID i
WHERE EXISTS (SELECT 1
              FROM ICCID_PROPERTY active_state
              WHERE active_state.ID_ICCID = i.ID_ICCID
                AND active_state.c_key = 'STATE'
                AND active_state.c_value = 'Active')
   OR (    EXISTS (SELECT 1
                   FROM ICCID_PROPERTY removed_state
                   WHERE removed_state.ID_ICCID = i.ID_ICCID
                     AND removed_state.c_key = 'STATE'
                     AND removed_state.c_value = 'Removed')
       AND EXISTS (SELECT 1
                   FROM ICCID_PROPERTY change_date
                   WHERE change_date.ID_ICCID = i.ID_ICCID
                     AND change_date.c_key = 'date.to.change'
                     AND to_date(change_date.c_value,'yyyymmdd')
                         BETWEEN to_date('2012-DEC-01 00:00:00', 'YYYY-MON-DD HH24:MI:SS')
                             AND to_date('2012-DEC-31 23:59:59', 'YYYY-MON-DD HH24:MI:SS'))
      )

I think you could join the Property table three times -- maybe something like this (untested):
-- Joins ICCID_Property three times, once per property being tested:
--   IP  = the STATE row with value 'Active'
--   IP2 = the STATE row with value 'Removed'
--   IP3 = the date.to.change row dated within December 2012
-- Each join is a LEFT JOIN correlated on its OWN alias; the WHERE clause then keeps an
-- ICCID when it is active, OR when it is removed AND its change date is in range.
-- (Inner joins would demand that one ICCID be Active and Removed at the same time.)
-- NOTE(review): assumes at most one property row per (ID_ICCID, C_Key); otherwise an
-- ICCID can appear more than once - confirm.
SELECT I.ID_ICCID
FROM ICCID I
LEFT JOIN ICCID_Property IP
    ON  IP.ID_ICCID = I.ID_ICCID
    AND IP.C_Key = 'STATE'
    AND IP.C_Value = 'Active'
LEFT JOIN ICCID_Property IP2
    ON  IP2.ID_ICCID = I.ID_ICCID
    AND IP2.C_Key = 'STATE'
    AND IP2.C_Value = 'Removed'
LEFT JOIN ICCID_Property IP3
    ON  IP3.ID_ICCID = I.ID_ICCID
    AND IP3.C_Key = 'date.to.change'
    AND to_date(IP3.C_Value,'yyyymmdd') >=
        to_date('2012-DEC-01 00:00:00', 'YYYY-MON-DD HH24:MI:SS')
    AND to_date(IP3.C_Value,'yyyymmdd') <=
        to_date('2012-DEC-31 23:59:59', 'YYYY-MON-DD HH24:MI:SS')
WHERE IP.ID_ICCID IS NOT NULL
   OR (IP2.ID_ICCID IS NOT NULL AND IP3.ID_ICCID IS NOT NULL)
Good luck.

Comparing Two Sets of Date Ranges in SQL

I have two sets of data with different date ranges.
Tbl 1:
ID, Date_Start, Date_End
1, 2010-01-01, 2010-01-09
1, 2010-01-10, 2010-01-19
1, 2010-01-30, 2010-01-31
Tbl 2:
ID, Date_Start, Date_End
1, 2010-01-01, 2010-01-04
1, 2010-01-08, 2010-01-17
1, 2010-01-30, 2010-01-31
I'd like to find cases where the date ranges in Tbl 1 are not entirely covered by the date ranges in Tbl 2. So for instance, in this example, I'd like output that looks something like this --
Output:
ID, Gap_Start, Gap_End
1, 2010-01-05, 2010-01-07
1, 2010-01-18, 2010-01-19
Date ranges will never overlap within a table. To do this, I'm using either DB2 SQL or SAS. Unfortunately, the datasets are big enough (millions of records) that I can't just brute force it.
Thank you!

Following on from Jon of All Trades' approach, this is a more completed solution. The crucial features are:
Use an auxiliary calendar table, which is just a list of all dates.
From the calendar table, JOIN to Tbl1 to get a list of dates which are in range.
Also do an anti-JOIN to Tbl2 to get only the dates which aren't in Tbl2's ranges.
I've enclosed those results in a Common Table Expression (CTE) called OutDates.
Define another CTE based on OutDates to get just the dates which start a gap; call this EarliestDates.
Define another CTE based on OutDates to get just the dates which end a gap; call this LatestDates.
JOIN EarliestDates and LatestDates to put each gap into a single row.
-- Finds, per ID, the date ranges that are inside a Tbl1 range but not inside any Tbl2
-- range of the same ID. Requires a Calendar table with one row (dt) per date.
-- Every join below also matches on ID, so dates and gaps of different IDs never mix.
WITH
-- One row per (ID, date) that lies in a Tbl1 range and in no Tbl2 range of that ID.
OutDates(ID, dt) AS
( SELECT Tbl1.ID, Calendar.dt FROM Calendar
  INNER JOIN Tbl1 ON Calendar.dt BETWEEN Tbl1.Date_Start AND Tbl1.Date_End
  LEFT OUTER JOIN Tbl2 ON Tbl2.ID = Tbl1.ID
                      AND Calendar.dt BETWEEN Tbl2.Date_Start AND Tbl2.Date_End
  WHERE Tbl2.ID IS NULL
)
,
-- Dates that start a gap: the previous day is not an uncovered date for the same ID.
EarliestDates AS
( SELECT earliest.ID, earliest.dt FROM OutDates earliest
  LEFT OUTER JOIN OutDates nonesuch_earlier
    ON nonesuch_earlier.ID = earliest.ID
   AND DateAdd(day, -1, earliest.dt) = nonesuch_earlier.dt
  WHERE nonesuch_earlier.ID IS NULL
)
,
-- Dates that end a gap: the next day is not an uncovered date for the same ID.
LatestDates AS
( SELECT latest.ID, latest.dt FROM OutDates latest
  LEFT OUTER JOIN OutDates nonesuch_later
    ON nonesuch_later.ID = latest.ID
   AND DATEADD(day, 1, latest.dt) = nonesuch_later.dt
  WHERE nonesuch_later.ID IS NULL
)
-- Pair each gap start with the nearest gap end of the same ID: the two anti-joins reject
-- any pair that has another start or another end strictly between them.
SELECT rangestart.ID, rangestart.dt AS Gap_Start, rangeend.dt AS Gap_End
FROM EarliestDates rangestart JOIN LatestDates rangeend
  ON rangeend.ID = rangestart.ID
 AND rangestart.dt <= rangeend.dt
LEFT OUTER JOIN EarliestDates nonesuch_inner1
  ON nonesuch_inner1.ID = rangestart.ID
 AND nonesuch_inner1.dt <= rangeend.dt AND nonesuch_inner1.dt > rangestart.dt
LEFT OUTER JOIN LatestDates nonesuch_inner2
  ON nonesuch_inner2.ID = rangestart.ID
 AND nonesuch_inner2.dt >= rangestart.dt AND nonesuch_inner2.dt < rangeend.dt
WHERE nonesuch_inner1.dt IS NULL AND nonesuch_inner2.dt IS NULL
This is a working implementation using SQL Server syntax for the common table expressions, but it should be easy to convert to DB2 syntax. I don't know how well it will scale, to be honest; I've only tested it with a very small dataset.

I don't think there is an efficient and general solution for all cases. Under certain circumstances, however, we can figure out some efficient ones. For instance, the code below assumes that: (1) datasets one and two have the same set of ids in the same order; and (2) the range of possible dates is relatively short (assumed here to be only the dates in the year 2010). Notice that one input range may generate two gaps.
/* Test data. Dataset one holds the ranges to be checked (id1, start1, finish1) and
   dataset two holds the covering ranges (id2, start2, finish2). Dates are read with the
   anydtdte informat and displayed with the e8601da format. */
data one;
input id1 (start1 finish1) (:anydtdte.);
format start1 finish1 e8601da.;
cards;
1 2010-01-01 2010-01-09
1 2010-01-10 2010-01-19
1 2010-01-30 2010-01-31
2 2010-01-02 2010-01-10
;
run;
data two;
input id2 (start2 finish2) (:anydtdte.);
format start2 finish2 e8601da.;
cards;
1 2010-01-01 2010-01-04
1 2010-01-08 2010-01-17
1 2010-01-30 2010-01-31
2 2010-01-05 2010-01-06
;
run;
/* assumptions:
(1) datasets one and two have the same set of ids in the same
sorted order;
(2) only possible dates are in the year of 2010
*/
/* Array bounds: one day before and one day after the year 2010, as numeric SAS dates. */
%let minDate = %sysevalf('01jan2010'd - 1);
%let maxDate = %sysevalf('31dec2010'd + 1);
/* Writes one row (id1, startGap, finishGap) per run of consecutive dates that lie in a
   range of dataset one but in no range of dataset two. Each iteration of the step
   handles one id: it reads that id's group from one, then the next group from two. */
data gaps;
/* One flag per date: inRange marks dates inside a range of one, covered marks dates
   inside a range of two. */
array inRange[&minDate:&maxDate] _temporary_;
array covered[&minDate:&maxDate] _temporary_;
/* Temporary arrays keep their values across iterations, so clear them for each id. */
do i = &minDate to &maxDate; inRange[i] = 0; covered[i] = 0; end;
/* Read every row of the current id1 group and flag its dates. */
do until (last.id1);
set one;
by id1;
do i = start1 to finish1; inRange[i] = 1; end;
end;
/* Read every row of the next id2 group and flag its dates; the two groups are paired
   by position only, hence the assumption that both datasets hold the same ids in the
   same order. */
do until (last.id2);
set two;
by id2;
do i = start2 to finish2; covered[i] = 1; end;
end;
format startGap finishGap e8601da.;
startGap = .;
finishGap = .;
/* Scan the dates in order; starting at the lower bound plus one keeps covered[i-1]
   inside the array. */
do i = &minDate+1 to &maxDate;
/* A gap opens on the first date that is in range but not covered. */
if inRange[i] and not covered[i] and missing(startGap) then startGap = i;
/* An open gap closes on the first date that is covered or out of range; the gap's last
   day is the day before. */
if (covered[i] or not inRange[i]) and not missing(startGap) and not covered[i-1] then do;
finishGap = i - 1;
output;
call missing(startGap, finishGap);
/* keep is a declarative statement: it applies to the whole output dataset regardless of
   where it appears in the step. */
keep id1 startGap finishGap;
end;
end;
run;
/* Check: print the gaps found. */
proc print data=gaps noobs;
run;
/* on lst
id1 startGap finishGap
1 2010-01-05 2010-01-07
1 2010-01-18 2010-01-19
2 2010-01-02 2010-01-04
2 2010-01-07 2010-01-10
*/

This is not a complete solution, as it returns a list of dates rather than ranges, but maybe it will be of use:
-- Lists every (ID, date) that falls inside a range of #Ranges1 but not inside any range
-- of #Ranges2 for the same ID. Requires a Dates table with one row per date.
SELECT
R1.ID, D.Date
FROM
#Ranges1 AS R1
-- Expand each range into one row per date it contains (both ends included).
INNER JOIN Dates AS D ON D.Date BETWEEN R1.StartDate AND R1.EndDate
-- EXCEPT removes the (ID, date) pairs produced by the second query and de-duplicates.
EXCEPT
SELECT
R2.ID, D.Date
FROM
#Ranges2 AS R2
INNER JOIN Dates AS D ON D.Date BETWEEN R2.StartDate AND R2.EndDate
Note that this solution requires a dates table: a table with one record per day, for all the dates you're likely to use. It has the advantages of being succinct, and handling overlapping date ranges (not necessary in your case, but maybe for the next guy).

For what it's worth, this is the method I ended up using. I think you could do it in pure SQL, but it got horrifically ugly and difficult to debug.
Step 1 -- I consolidated the date ranges in both datasets. This means that something like
ID, Start_Date, End_Date
1, 2010-01-01, 2010-01-31
1, 2010-02-01, 2010-02-28
got transformed into this --
ID, Start_Date, End_Date
1, 2010-01-01, 2010-02-28.
The query I used to produce this was --
-- Consolidates back-to-back date ranges of Table1: ranges of the same Id where one
-- starts the day after another ends are merged into a single range.
WITH Cte_recomb (Id, Start_date, End_date, Hopcount) AS
    -- Anchor: every original range, with Hopcount = 1.
    (SELECT Id,
            Start_date,
            End_date,
            1 AS Hopcount
     FROM Table1
     UNION ALL
     -- Recursive step: extend a chain with the range that starts the day after it ends.
     -- Hopcount must be read from Cte_recomb, the only correlation name in scope here.
     -- The join is kept in comma form on purpose: DB2 rejects an explicit JOIN ... ON
     -- inside the recursive part of a common table expression (presumably still true
     -- for the target version - verify).
     SELECT Cte_recomb.Id,
            Cte_recomb.Start_date,
            Table1.End_date,
            (Cte_recomb.Hopcount + 1) AS Hopcount
     FROM Cte_recomb, Table1
     WHERE (Cte_recomb.Id = Table1.Id) AND
           (Cte_recomb.End_date + 1 day = Table1.Start_date)),
-- For every chain start, keep the furthest end date reached.
Cte_maxenddate AS
    (SELECT Id,
            Start_date,
            Max (End_date) AS End_date
     FROM Cte_recomb
     GROUP BY Id, Start_date
     ORDER BY Id, Start_date)
-- Drop chains that are contained in a chain of the same Id that starts earlier and ends
-- at the same date or later (anti-join on Nextrec).
SELECT Maxend.*
FROM Cte_maxenddate AS Maxend
LEFT JOIN
    Cte_recomb AS Nextrec
    ON (Nextrec.Id = Maxend.Id) AND
       (Nextrec.Start_date < Maxend.Start_date) AND
       (Nextrec.End_date >= Maxend.End_date)
WHERE Nextrec.Id IS NULL;
Step 2 --
I produced another dataset that created a record for every overlap between the two datasets. You'll need an additional step to find cases where a given record in Table1 doesn't have a corresponding record in Table2 at all.
-- One row per pair of overlapping ranges (same Id) from Table1 and Table2, skipping
-- pairs whose ranges are identical.
SELECT Table1.Id,
       Table1.Start_date AS Table1_start_date,
       Table1.End_date AS Table1_end_date,
       Table2.Start_date AS Table2_start_date,
       Table2.End_date AS Table2_end_date
FROM Table1
INNER JOIN
    Table2
    -- Match on the Id column of both tables. The previous condition compared a column
    -- that appears nowhere else in the query with an unqualified, ambiguous Id.
    ON (Table1.Id = Table2.Id) AND
       -- Two ranges overlap when either one starts inside the other.
       ( (Table1.Start_date BETWEEN Table2.Start_date AND Table2.End_date) OR
         (Table2.Start_date BETWEEN Table1.Start_date AND Table1.End_date)) AND
       -- Identical ranges leave no gap, so they are excluded.
       ( (Table1.Start_date <> Table2.Start_date) OR
         (Table1.End_date <> Table2.End_date))
ORDER BY Table1.Id, Table1.Start_date, Table2.Start_date;
Step 3 --
I take the above dataset, and run the following SAS job. I tried to do this in pure SQL with recursive queries, but it got uglier and uglier every time I looked at it.
/* Derives gap rows (Gap_Start_Date, Gap_End_Date) from Table1_Compare, which holds one
   row per Table1 range / Table2 range pair. The input must be sorted by ID,
   Table1_Start_Date and Table2_Start_Date. A single input row can write more than one
   output row. */
Data Table1_Gaps;
Set Table1_Compare;
By ID Table1_Start_Date Table2_Start_Date;
format Gap_Start_Date yymmdd10.;
format Gap_End_Date yymmdd10.;
format Old_Start_Date yymmdd10.;
format Old_End_Date yymmdd10.;
/* Old_* carry the Table2 range of the previous row into the current iteration. */
Retain Old_Start_Date Old_End_Date;
/* No Table2 range on this row: the whole Table1 range is a gap. */
IF (Table2_End_Date = .) then do;
Gap_Start_Date = Table1_Start_Date;
Gap_End_Date = Table1_End_Date;
output;
end;
else do;
/* The Table2 range starts after the Table1 range does: there is a gap in front of it. */
If (Table2_Start_Date > Table1_Start_Date) then do;
/* First Table2 range for this Table1 range: the gap runs from the Table1 start. */
if first.Table1_Start_Date then do;
Gap_Start_Date = Table1_Start_Date;
Gap_End_Date = Table2_Start_Date - 1;
output;
end;
/* Later Table2 range: the gap runs from the day after the previous Table2 range ended. */
else do;
Gap_Start_Date = Old_End_Date + 1;
Gap_End_Date = Table2_Start_Date - 1;
output;
end;
end;
/* The last Table2 range ends before the Table1 range does: there is a trailing gap. */
If (Table2_End_Date < Table1_End_Date) then do;
if Last.Table1_Start_Date then do;
Gap_Start_Date = Table2_End_Date + 1;
Gap_End_Date = Table1_End_Date;
output;
end;
end;
end;
/* Remember this row's Table2 range for the next iteration; the helpers are not written
   to the output dataset. */
Old_Start_Date = Table2_Start_Date;
Old_End_Date = Table2_End_Date;
drop Old_Start_Date Old_End_Date;
run;
I haven't verified it entirely yet, but this approach does seem to have given me the results I wanted. Any thoughts?

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

{SAS} Converting a SQL merge in to HASH merge - sql

Related

Hash join equivalent on PROC SQL between

Combine 2 sql queries into one with eliminating duplicate colums

how to join 2 query into one in EXCEL ORACLE CONNECTION

Sql query for following scenario

Comparing Two Sets of Date Ranges in SQL

Categories

Resources