How to rewrite nested subqueries so that hive can run them - hive

-- Counts customers per demographic profile (the cd_* columns in the GROUP BY).
-- Only customers living in one of five counties are considered, and each must
-- have a store sale in months 1-4 of 1999 plus a web OR catalog sale in the
-- same period. cnt1..cnt6 are all count(*) and therefore carry the same value.
select
cd_gender,
cd_marital_status,
cd_education_status,
count(*) cnt1,
cd_purchase_estimate,
count(*) cnt2,
cd_credit_rating,
count(*) cnt3,
cd_dep_count,
count(*) cnt4,
cd_dep_employed_count,
count(*) cnt5,
cd_dep_college_count,
count(*) cnt6
from
customer c,customer_address ca,customer_demographics
where
c.c_current_addr_sk = ca.ca_address_sk and
ca_county in ('Greer County','Boone County','Cumberland County','Tyler County','Marion County') and
cd_demo_sk = c.c_current_cdemo_sk and
exists (select *
from store_sales,date_dim
where c.c_customer_sk = ss_customer_sk and
ss_sold_date_sk = d_date_sk and
d_year = 1999 and
d_moy between 1 and 1+3) and
-- NOTE(review): the two EXISTS predicates below are combined with OR, so they
-- are not top-level conjuncts of the WHERE clause; this appears to be the
-- construct the Hive error message complains about.
(exists (select *
from web_sales,date_dim
where c.c_customer_sk = ws_bill_customer_sk and
ws_sold_date_sk = d_date_sk and
d_year = 1999 and
d_moy between 1 ANd 1+3) or
exists (select *
from catalog_sales,date_dim
where c.c_customer_sk = cs_ship_customer_sk and
cs_sold_date_sk = d_date_sk and
d_year = 1999 and
d_moy between 1 and 1+3))
group by cd_gender,
cd_marital_status,
cd_education_status,
cd_purchase_estimate,
cd_credit_rating,
cd_dep_count,
cd_dep_employed_count,
cd_dep_college_count
order by cd_gender,
cd_marital_status,
cd_education_status,
cd_purchase_estimate,
cd_credit_rating,
cd_dep_count,
cd_dep_employed_count,
cd_dep_college_count
limit 100;
When i run this query on hive it returns this error
"FAILED: SemanticException [Error 10249]: org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSubquerySemanticException: Line 23:2 Unsupported SubQuery Expression '3': Only SubQuery expressions that are top level conjuncts are allowed
"
This error occurs because the second and third EXISTS subqueries are combined with OR, so they are not top-level conjuncts of the WHERE clause.
Any ideas on how I can rewrite this query so that it works on Hive?

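You might try restructuring the WHERE clause so that every subquery predicate is a top-level conjunct. Be careful when doing so: AND binds more tightly than OR in SQL, so moving the OR'ed EXISTS predicates around without their parentheses changes the meaning of the filter.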
-- Hive only accepts subquery predicates that are top-level conjuncts, i.e.
-- ANDed directly in the WHERE clause. Merely moving the OR'ed EXISTS pair to
-- the front does not achieve that, and without parentheses it also changes
-- the filter: AND binds tighter than OR, so it would be evaluated as
-- "web_exists OR (catalog_exists AND <all remaining conditions>)".
-- Instead, the two OR'ed EXISTS are folded into a single EXISTS over the
-- UNION ALL of qualifying web and catalog customers, which is equivalent to
-- "web_exists OR catalog_exists" and leaves only ANDed subquery predicates.
where
c.c_current_addr_sk = ca.ca_address_sk and
ca_county in ('Greer County','Boone County','Cumberland County','Tyler County','Marion County') and
cd_demo_sk = c.c_current_cdemo_sk and
exists (select *
        from store_sales,date_dim
        where c.c_customer_sk = ss_customer_sk and
              ss_sold_date_sk = d_date_sk and
              d_year = 1999 and
              d_moy between 1 and 1+3) and
exists (select *
        from (
              -- customers billed for a web sale in months 1-4 of 1999
              select ws_bill_customer_sk as customer_sk
              from web_sales,date_dim
              where ws_sold_date_sk = d_date_sk and
                    d_year = 1999 and
                    d_moy between 1 and 1+3
              union all
              -- customers shipped a catalog sale in months 1-4 of 1999
              select cs_ship_customer_sk as customer_sk
              from catalog_sales,date_dim
              where cs_sold_date_sk = d_date_sk and
                    d_year = 1999 and
                    d_moy between 1 and 1+3
             ) online_sales
        where c.c_customer_sk = online_sales.customer_sk)

Related

Hive TPCDS Query30 "Only SubQuery expressions that are top level conjuncts are allowed "

I am getting the above error when trying to run a tpcds query 30 in Hive. I did research and know this is not allowed in Hive so I am wondering how to rewrite this query. I directly got it from this website.
http://www.tpc.org/tpcds/default5.asp
Error: Error while compiling statement: FAILED: SemanticException Line 0:-1 Unsupported SubQuery Expression 'ctr_state': Only SubQuery expressions that are top level conjuncts are allowed
Query 30
-- customer_total_return: summed web return amount per returning customer and
-- address state, for returns whose return date falls in year 2000.
with customer_total_return as
(select wr_returning_customer_sk as ctr_customer_sk
,ca_state as ctr_state,
sum(wr_return_amt) as ctr_total_return
from web_returns
,date_dim
,customer_address
where wr_returned_date_sk = d_date_sk
and d_year =2000
and wr_returning_addr_sk = ca_address_sk
group by wr_returning_customer_sk
,ca_state)
-- Main query: customers currently living in 'GA' whose total return exceeds
-- 1.2 times the average total return of their state.
select c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
,c_last_review_date_sk,ctr_total_return
from customer_total_return ctr1
,customer_address
,customer
-- NOTE(review): the correlated scalar subquery below (correlated on ctr_state)
-- is presumably what triggers the Hive "Unsupported SubQuery Expression
-- 'ctr_state'" error quoted with this query.
where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
from customer_total_return ctr2
where ctr1.ctr_state = ctr2.ctr_state)
and ca_address_sk = c_current_addr_sk
and ca_state = 'GA'
and ctr1.ctr_customer_sk = c_customer_sk
order by c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
,c_last_review_date_sk,ctr_total_return
limit 100;
Update
Query 30 may contain a typo when it is generated with the TPC-DS suite: the column c_last_review_date_sk does not exist in the customer table, so you need to change it to c_last_review_date.
Calculate avg(ctr_total_return) in the subquery customer_total_return using analytic function and remove subquery from the WHERE:
-- customer_total_return: one row per (returning customer, state) with the
-- summed web return amount for returns dated in 2000, plus the average of
-- those totals across the same state (ctr_state_avg). Carrying the state
-- average on each row removes the need for a correlated subquery in WHERE.
with customer_total_return as (
    select
        s.ctr_customer_sk,
        s.ctr_state,
        s.ctr_total_return,
        avg(s.ctr_total_return) over (partition by s.ctr_state) as ctr_state_avg
    from (
        select
            wr.wr_returning_customer_sk as ctr_customer_sk,
            ca.ca_state as ctr_state,
            sum(wr.wr_return_amt) as ctr_total_return
        from web_returns wr
        inner join date_dim d
            on wr.wr_returned_date_sk = d.d_date_sk
        inner join customer_address ca
            on wr.wr_returning_addr_sk = ca.ca_address_sk
        where d.d_year = 2000
        group by
            wr.wr_returning_customer_sk,
            ca.ca_state
    ) s
)
-- Customers currently living in 'GA' whose total return exceeds 1.2 times
-- the average total return of their state.
select
    c.c_customer_id,
    c.c_salutation,
    c.c_first_name,
    c.c_last_name,
    c.c_preferred_cust_flag,
    c.c_birth_day,
    c.c_birth_month,
    c.c_birth_year,
    c.c_birth_country,
    c.c_login,
    c.c_email_address,
    c.c_last_review_date_sk,
    ctr1.ctr_total_return
from customer_total_return ctr1
inner join customer c
    on ctr1.ctr_customer_sk = c.c_customer_sk
inner join customer_address ca
    on ca.ca_address_sk = c.c_current_addr_sk
where ca.ca_state = 'GA'
    and ctr1.ctr_total_return > ctr1.ctr_state_avg*1.2
order by
    c_customer_id,
    c_salutation,
    c_first_name,
    c_last_name,
    c_preferred_cust_flag,
    c_birth_day,
    c_birth_month,
    c_birth_year,
    c_birth_country,
    c_login,
    c_email_address,
    c_last_review_date_sk,
    ctr_total_return
limit 100;

Hive select query return top 100 syntax error?

Here is my Hive query, straight from the TPC-DS toolkit:
-- customer_total_return: summed return fee per customer and store for
-- returns whose return date falls in year 2000.
WITH customer_total_return
AS (SELECT sr_customer_sk AS ctr_customer_sk,
sr_store_sk AS ctr_store_sk,
Sum(sr_fee) AS ctr_total_return
FROM store_returns,
date_dim
WHERE sr_returned_date_sk = d_date_sk
AND d_year = 2000
GROUP BY sr_customer_sk,
sr_store_sk)
-- NOTE: "TOP 100" on the next line is the token sequence named in the
-- ParseException reported for this query.
SELECT TOP 100 c_customer_id
FROM customer_total_return ctr1,
store,
customer
-- Keeps customers whose total exceeds 1.2 times the average total of the
-- same store, restricted to stores in state 'TN'.
WHERE ctr1.ctr_total_return > (SELECT Avg(ctr_total_return) * 1.2
FROM customer_total_return ctr2
WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk)
AND s_store_sk = ctr1.ctr_store_sk
AND s_state = 'TN'
AND ctr1.ctr_customer_sk = c_customer_sk
ORDER BY c_customer_id;
However, I get the following error when attempting to run it:
FAILED: ParseException line 11:11 cannot recognize input near 'TOP'
'100' 'c_customer_id' in selection target
My understanding is that TOP 100 is not syntactically valid in HiveQL. How can I rewrite this properly?
Use LIMIT instead of TOP, like this:
WITH customer_total_return AS (
    -- Summed return fee per customer and store for returns dated in 2000.
    SELECT
        sr_customer_sk AS ctr_customer_sk,
        sr_store_sk AS ctr_store_sk,
        SUM(sr_fee) AS ctr_total_return
    FROM store_returns
    INNER JOIN date_dim
        ON sr_returned_date_sk = d_date_sk
    WHERE d_year = 2000
    GROUP BY
        sr_customer_sk,
        sr_store_sk
)
-- Customers of 'TN' stores whose total exceeds 1.2 times the average total
-- of the same store. The row cap is expressed with a trailing LIMIT, which
-- is the HiveQL form; there is no TOP in the select list.
SELECT c_customer_id
FROM customer_total_return ctr1
INNER JOIN store
    ON s_store_sk = ctr1.ctr_store_sk
INNER JOIN customer
    ON ctr1.ctr_customer_sk = c_customer_sk
WHERE s_state = 'TN'
    AND ctr1.ctr_total_return > (
        SELECT AVG(ctr2.ctr_total_return) * 1.2
        FROM customer_total_return ctr2
        WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk
    )
ORDER BY c_customer_id
LIMIT 100;
This is a bad example of a query on many levels. I would suggest:
-- customer_total_return: per (customer, store) return-fee total for returns
-- dated in 2000, together with the average of those totals across the same
-- store (window over the grouped rows), so no separate subquery is needed.
WITH customer_total_return AS (
    SELECT
        sr.sr_customer_sk AS ctr_customer_sk,
        sr.sr_store_sk AS ctr_store_sk,
        SUM(sr.sr_fee) AS ctr_total_return,
        AVG(SUM(sr.sr_fee)) OVER (PARTITION BY sr.sr_store_sk) AS avg_store_sr_fee
    FROM store_returns sr
    INNER JOIN date_dim d
        ON sr.sr_returned_date_sk = d.d_date_sk
    WHERE d.d_year = 2000
    GROUP BY
        sr.sr_customer_sk,
        sr.sr_store_sk
)
SELECT c.c_customer_id
FROM customer_total_return ctr
INNER JOIN store s
    ON s.s_store_sk = ctr.ctr_store_sk
INNER JOIN customer c
    ON ctr.ctr_customer_sk = c.c_customer_sk
WHERE s.s_state = 'TN'
    AND ctr.ctr_total_return > 1.2 * ctr.avg_store_sr_fee
ORDER BY c.c_customer_id
LIMIT 100;
Notes:
Never use commas in the FROM clause. Always use proper, explicit, standard JOIN syntax.
Qualify all column references, especially when a query has more than one table reference.
The subquery to calculate the average is not needed.
Hive uses LIMIT, not TOP.

Get Max date by another column

I am trying to write a simple query to get the MAX DEMAND_DATE for each INV_CART_ID. Here is my existing query:
-- Lists every row for item 1 in business unit '11MMS' with a non-zero cart
-- count, newest DEMAND_DATE first. It returns all matching rows per cart,
-- not only the latest one.
SELECT BUSINESS_UNIT, INV_CART_ID, INV_ITEM_ID, CART_COUNT_QTY, DEMAND_DATE
FROM PS_CART_CT_INF_INV A
WHERE A.INV_ITEM_ID = 1
AND A.BUSINESS_UNIT = '11MMS'
AND A.CART_COUNT_QTY <> 0
ORDER BY DEMAND_DATE DESC
Current Output:
Desired Output:
BUSINESS_UNIT INV_CART_ID INV_ITEM_ID CART_COUNT_QTY DEMAND_DATE
11MMS 405 1 5.0000 2018-05-29
11MMS OUTPT_INFUSION 1 4.0000 2018-05-29
11MMS 938 1 15.0000 2018-05-31
11MMS 286 1 1.0000 2018-05-07
11MMS 708 1 4.0000 2018-04-05
This is what I have tried doing so far:
-- Attempted "latest row per cart" query.
-- NOTE: the GROUP BY includes CART_COUNT_QTY, so MAX(DEMAND_DATE) is taken
-- per (cart, business unit, item, quantity) rather than per cart.
SELECT MAX(DEMAND_DATE) AS DEMAND_DATE, INV_CART_ID, BUSINESS_UNIT,
INV_ITEM_ID, CART_COUNT_QTY
FROM PS_CART_CT_INF_INV A
WHERE A.INV_ITEM_ID = 1
AND A.BUSINESS_UNIT = '11MMS'
AND A.CART_COUNT_QTY <> 0
-- NOTE: the subquery is correlated on INV_ITEM_ID only and yields one max
-- date per cart, so the IN test accepts a row whose date equals the max date
-- of ANY cart for that item, not necessarily its own cart.
AND A.DEMAND_DATE IN (SELECT MAX (DEMAND_DATE) FROM PS_CART_CT_INF_INV B
WHERE A.INV_ITEM_ID = B.INV_ITEM_ID GROUP BY INV_CART_ID)
GROUP BY INV_CART_ID, BUSINESS_UNIT, INV_ITEM_ID, CART_COUNT_QTY
However it doesn't return all INV_CART_ID #'s and is not retrieving the correct row (wrong DEMAND_DATE):
Use ROW_NUMBER:
-- ranked_demand: the filtered rows, numbered within each cart from the
-- newest DEMAND_DATE (demand_rank = 1) to the oldest.
WITH ranked_demand AS (
    SELECT
        BUSINESS_UNIT,
        INV_CART_ID,
        INV_ITEM_ID,
        CART_COUNT_QTY,
        DEMAND_DATE,
        ROW_NUMBER() OVER (
            PARTITION BY INV_CART_ID
            ORDER BY DEMAND_DATE DESC
        ) AS demand_rank
    FROM PS_CART_CT_INF_INV
    WHERE CART_COUNT_QTY <> 0
        AND BUSINESS_UNIT = '11MMS'
        AND INV_ITEM_ID = 1
)
-- Keep exactly one row per cart: the one with the latest demand date.
SELECT
    BUSINESS_UNIT,
    INV_CART_ID,
    INV_ITEM_ID,
    CART_COUNT_QTY,
    DEMAND_DATE
FROM ranked_demand
WHERE demand_rank = 1
ORDER BY DEMAND_DATE DESC;
If you don't want to use analytic functions, then I still would not use your current approach. Instead, I would join to a subquery, like this:
-- Joins each filtered row to the latest DEMAND_DATE of its cart, computed
-- over the same item / business unit / quantity filters, and keeps only the
-- rows that carry that latest date.
SELECT
    inv.BUSINESS_UNIT,
    inv.INV_CART_ID,
    inv.INV_ITEM_ID,
    inv.CART_COUNT_QTY,
    inv.DEMAND_DATE
FROM PS_CART_CT_INF_INV inv
INNER JOIN (
    SELECT
        INV_CART_ID,
        MAX(DEMAND_DATE) AS MAX_DEMAND_DATE
    FROM PS_CART_CT_INF_INV
    WHERE CART_COUNT_QTY <> 0
        AND BUSINESS_UNIT = '11MMS'
        AND INV_ITEM_ID = 1
    GROUP BY INV_CART_ID
) latest
    ON latest.INV_CART_ID = inv.INV_CART_ID
    AND latest.MAX_DEMAND_DATE = inv.DEMAND_DATE
WHERE inv.CART_COUNT_QTY <> 0
    AND inv.BUSINESS_UNIT = '11MMS'
    AND inv.INV_ITEM_ID = 1;
The issue with your current query, even once corrected, is that it is using a correlated subquery in the WHERE clause. These are known to be potential performance killers, and so should be avoided if possible.
I think you want :
-- Latest matching row per cart via a correlated MAX.
-- The subquery must look at the same subset of rows as the outer query
-- (same item, same business unit, non-zero quantity). Otherwise the MAX can
-- come from a row the outer filters exclude, and the cart then silently
-- disappears from the result.
SELECT a.BUSINESS_UNIT, a.INV_CART_ID, a.INV_ITEM_ID, a.CART_COUNT_QTY, a.DEMAND_DATE
FROM PS_CART_CT_INF_INV AS a
WHERE a.INV_ITEM_ID = 1 AND a.BUSINESS_UNIT = '11MMS' AND
      a.CART_COUNT_QTY <> 0 AND
      a.DEMAND_DATE = (SELECT MAX(b.DEMAND_DATE)
                       FROM PS_CART_CT_INF_INV AS b
                       WHERE b.INV_CART_ID = a.INV_CART_ID AND
                             b.INV_ITEM_ID = a.INV_ITEM_ID AND
                             b.BUSINESS_UNIT = a.BUSINESS_UNIT AND
                             b.CART_COUNT_QTY <> 0
                      );
However, this will give you duplicate records when several rows of a cart share the same latest DEMAND_DATE. If you want to avoid duplicates, you can use an identity column or primary key (pk) in the WHERE clause instead:
. . .
-- Picks exactly one row per cart by key. The subquery applies the item /
-- business unit / quantity filters itself so that the chosen row is the
-- newest row that actually qualifies, and b.pk breaks ties between rows
-- sharing the same DEMAND_DATE so the pick is deterministic.
WHERE pk = (SELECT TOP (1) b.pk
            FROM PS_CART_CT_INF_INV as b
            WHERE a.INV_CART_ID = b.INV_CART_ID AND
                  b.INV_ITEM_ID = 1 AND
                  b.BUSINESS_UNIT = '11MMS' AND
                  b.CART_COUNT_QTY <> 0
            ORDER BY b.DEMAND_DATE DESC, b.pk DESC
           );

I need to pick highest NPA_TIME_ZONE_COUNT

I am using this query in Oracle.
-- Matches rows of reject (alias r) to npanxx rows (alias n) on the NPA, NXX
-- and first line-range digit taken from r.rating_orignum_used, keeping only
-- the npanxx row with the latest npanxx_effdate per (npa, nxx, line range).
-- x and x2 are self-joins of npanxx supplying non-null, non-'0' time zones
-- for the same NPA (x additionally matches the first NXX digit).
-- NOTE: everything after the "--" in the select list is commented out, which
-- includes the time-zone columns and count(x2.time_zone) as npa_time_zone_count.
-- NOTE(review): the hint names the table "reject" although it is aliased as
-- r in the FROM clause — confirm the hint is actually honored.
SELECT /*+parallel (reject,4) */
distinct n.rowid as npanxx_row_id, r.rating_orignum_used, n.npa, n.nxx, npanxx_effdate, n.line_range_from_number, n.line_range_to_number, n.city, n.state, n.country, n.country_code, n.ocn, n.lata, n.clli_code, n.stepcode, n.juris, n.time_zone as current_time_zone--, x.time_zone as npanxx_timezone, x2.time_zone as npa_timezone, case when x.time_zone >= '1' then x.time_zone else x2.time_zone end new_time_zone, count(x2.time_zone) as npa_time_zone_count
from npanxx n
left join npanxx x
on n.npa = x.npa and (substr(n.nxx, 1,1) = substr(x.nxx,1,1))
and x.time_zone is not null and x.time_zone <> '0'
left join npanxx x2
on n.npa = x2.npa
and x2.time_zone is not null and x2.time_zone <> '0'
inner join reject r
on substr(r.rating_orignum_used,1,3) = n.npa and substr(r.rating_orignum_used,4,3) = n.nxx and substr(r.rating_orignum_used, 7,1) = substr(n.line_range_from_number,1,1)
where
n.npanxx_effdate = (select max(sub.npanxx_effdate) from npanxx sub where n.npa=sub.npa and n.nxx = sub.nxx and n.line_range_from_number = sub.line_range_from_number)
and r.carrier = 'LEVEL3' and r.error_code = '309' and r.rowid in ('AAQBSyAKKAABZ7yAAJ')and trunc(r.processdate) >= trunc(sysdate-90)
-- x.time_zone and x2.time_zone are grouping columns, so each distinct pair of
-- time zones produces its own output row for the same n.rowid.
group by n.rowid, r.rating_orignum_used, n.npa, n.nxx, n.npanxx_effdate, n.line_range_from_number, n.line_range_to_number, n.city, n.state, n.country, n.country_code, n.ocn, n.lata, n.clli_code, n.stepcode, n.juris, n.time_zone, x.time_zone , x2.time_zone
By running this query I get the result
NPANXX_ROW_ID ..... npa_time_zone_count
AABWcFABmAAAxMrAAy 3780
AABWcFABmAAAxMrAAy 10
and I need one row with the highest count so it come as
NPANXX_ROW_ID ..... npa_time_zone_count
AABWcFABmAAAxMrAAy 3780
I used HAVING statement but its just giving me error
ORA-01427: single-row subquery returns more than one row
-- Attempted filter: keep only the group whose COUNT(*) equals the maximum
-- count computed by the subquery over npanxx (aliased as "inner").
-- NOTE(review): "npa" in "npa = inner.npa" is unqualified; it presumably
-- resolves to inner.npa itself rather than to the outer n.npa — verify.
-- NOTE(review): time_zone is compared with the number 0 here, whereas the
-- main query compares it with the string '0'.
HAVING
COUNT(*) = (
SELECT
MAX(count(x2.time_zone))
FROM
npanxx inner
WHERE
inner.time_zone IS NOT NULL AND
inner.time_zone <> 0 AND
npa = inner.npa
and x2.NXX = INNER.NXX
GROUP BY
inner.state,
inner.country,
inner.time_zone)
You can use an analytic function such as ROW_NUMBER or DENSE_RANK to number the rows and pick the highest one, in this way:
WITH subquery AS (
    /* your complex query goes here */
    -- Two literal rows stand in for the real query's output.
    SELECT 'AABWcFABmAAAxMrAAy' AS NPANXX_ROW_ID, '....' AS "....", 3780 AS npa_time_zone_count
    FROM dual
    UNION ALL
    SELECT 'AABWcFABmAAAxMrAAy' AS NPANXX_ROW_ID, '....' AS "....", 10 AS npa_time_zone_count
    FROM dual
),
-- ranked: every row numbered within its NPANXX_ROW_ID, highest
-- npa_time_zone_count first.
ranked AS (
    SELECT
        t.*,
        ROW_NUMBER() OVER (
            PARTITION BY t.NPANXX_ROW_ID
            ORDER BY t.npa_time_zone_count DESC
        ) AS rn
    FROM subquery t
)
-- Keep only the top-ranked row of each NPANXX_ROW_ID.
SELECT *
FROM ranked
WHERE rn = 1;
======================================================
NPANXX_ROW_ID .... NPA_TIME_ZONE_COUNT RN
------------------ ---- ------------------- ----------
AABWcFABmAAAxMrAAy .... 3780 1

Oracle SQL Group by Clause

I would like to write a SQL query that returns the storage metrics of the top 5 tablespaces. The query below returns the metrics for all tablespaces. I would appreciate it if someone could fine-tune it to return only the top N.
-- Size, used and free space in MB (bytes / 1024 / 1024, rounded to 2 places)
-- for every tablespace of the target '<id>', ordered by tablespace size.
-- The derived table df restricts the result to tablespaces that have data
-- files on a matching RAC or non-RAC-instance database target; its
-- df_count and auto_extend columns are computed but not selected.
SELECT
ts.tablespace_name AS TBNAME,
round((ts.tablespace_size/1024/1024),2) AS SIZE_MB,
round((ts.tablespace_used_size/1024/1024),2) AS USED_MB,
round(((ts.tablespace_size - ts.tablespace_used_size)/1024/1024),2) AS FREE_MB
FROM
mgmt$db_tablespaces ts,
(SELECT d.target_guid, d.tablespace_name, count(d.file_name) df_count,
sum(decode(d.autoextensible, 'YES', 1, 0)) auto_extend
FROM mgmt$db_datafiles d, mgmt$target t
WHERE t.target_guid = '<id>' AND
(t.target_type='rac_database' OR
(t.target_type='oracle_database' AND t.TYPE_QUALIFIER3 != 'RACINST')) AND
t.target_guid = d.target_guid
GROUP BY d.target_guid, d.tablespace_name) df
WHERE
ts.target_guid = df.target_guid AND
df.tablespace_name = ts.tablespace_name
ORDER BY ts.tablespace_size;
Thanks
You can use ROWNUM. Oracle assigns ROWNUM to rows as they are selected, before the ORDER BY of the same query block is applied.
You therefore need to sort in a subquery and filter on ROWNUM in the outer query. You can also use the RANK() function to get Top-N results.
-- Top 5 tablespaces of the target '<id>' by size.
-- The inner query sorts first; ROWNUM is then applied to the already-sorted
-- rows in the outer query. The sort is DESCENDING so the first five rows are
-- the five largest tablespaces (drop DESC to get the five smallest instead).
SELECT
*
FROM
(
SELECT
ts.tablespace_name AS TBNAME,
round((ts.tablespace_size/1024/1024),2) AS SIZE_MB,
round((ts.tablespace_used_size/1024/1024),2) AS USED_MB,
round(((ts.tablespace_size - ts.tablespace_used_size)/1024/1024),2) AS FREE_MB
FROM
mgmt$db_tablespaces ts,
-- df limits the result to tablespaces with data files on a matching target.
(SELECT d.target_guid, d.tablespace_name, count(d.file_name) df_count,
sum(decode(d.autoextensible, 'YES', 1, 0)) auto_extend
FROM mgmt$db_datafiles d, mgmt$target t
WHERE t.target_guid = '<id>' AND
(t.target_type='rac_database' OR
(t.target_type='oracle_database' AND t.TYPE_QUALIFIER3 != 'RACINST')) AND
t.target_guid = d.target_guid
GROUP BY d.target_guid, d.tablespace_name) df
WHERE
ts.target_guid = df.target_guid AND
df.tablespace_name = ts.tablespace_name
ORDER BY ts.tablespace_size DESC
)
WHERE ROWNUM <= 5;