HIVE: Error in GROUP BY Key - sql

hive -e "select a.EMP_ID,
count(distinct c.SERIAL_NBR) as NUM_CURRENT_EMP,
count(distinct c.SERIAL_NBR)/count(distinct a.SERIAL_NBR) as DISTINCT_EMP
from ORDERS_COMBINED_EMPLOYEES as a
inner join ORDERS_EMPLOYEE_STATS as b
on a.CPP_ID = b.CPP_ID
left join ( select SERIAL_NBR, MIN(TRAN_DT) as TRAN_DT
from EMP_TXNS
group by SERIAL_NBR
) c
on c.SERIAL_NBR = a.SERIAL_NBR
where c.TRAN_DT > a.LAST_TXN_DT
group by a.EMP_ID
having (
(NUM_CURRENT_EMP >= 25 and DISTINCT_EMP > 0.01)
) ; " > EMPLOYEE_ORDERS.txt
Getting error message,
"FAILED: SemanticException [Error 10025]: Line 15:31 Expression not in GROUP BY key '0.01'".
When I ran the same query with just one condition in the HAVING clause (NUM_CURRENT_EMP >= 25), it ran fine without any issues. NUM_CURRENT_EMP is an int and DISTINCT_EMP is a float in the table where I am trying to insert the results. I can't figure out what is wrong.
Any help is appreciated.

What happens if you replace the aliases in the having with the expressions that define them?
having count(distinct c.SERIAL_NBR) >= 25 and
count(distinct c.SERIAL_NBR)/count(distinct a.SERIAL_NBR) > 0.01

Related

Query error: Column name ICUSTAY_ID is ambiguous. Using multiple subqueries in BigQuery

Hi, I receive the following query error "Query error: Column name ICUSTAY_ID is ambiguous" referred to the third last line of code (see the following code). Please can you help me? Thank you so much!
I am an SQL beginner..
WITH t AS
(
SELECT
*
FROM
(
SELECT *,
DATETIME_DIFF(CHARTTIME, INTIME, MINUTE) AS pi_recorded
FROM
(
SELECT
*
FROM
(
SELECT * FROM
(SELECT i.SUBJECT_ID, p.dob, i.hadm_id, p.GENDER, a.ETHNICITY, a.ADMITTIME, a.INSURANCE, i.ICUSTAY_ID,
i.DBSOURCE, i.INTIME, DATETIME_DIFF(a.ADMITTIME, p.DOB, DAY) AS age,
CASE
WHEN DATETIME_DIFF(a.ADMITTIME, p.DOB, DAY) <= 32485
THEN 'adult'
WHEN DATETIME_DIFF(a.ADMITTIME, p.DOB, DAY) > 32485
then '>89'
END AS age_group
FROM `project.mimic3.ICUSTAYS` AS i
INNER JOIN `project.mimic3.PATIENTS` AS p ON i.SUBJECT_ID = p.SUBJECT_ID
INNER JOIN `project.mimic3.ADMISSIONS` AS a ON i.HADM_ID = a.HADM_ID)
WHERE age >= 6570
) AS t1
LEFT JOIN
(
SELECT ITEMID, ICUSTAY_ID, CHARTTIME, VALUE, FROM `project.mimic3.CHARTEVENTS`
WHERE ITEMID = 551 OR ITEMID = 552 OR ITEMID = 553 OR ITEMID = 224631
OR ITEMID = 224965 OR ITEMID = 224966
) AS t2
ON t1.ICUSTAY_ID = t2.ICUSTAY_ID
)
)
WHERE ITEMID IN (552, 553, 224965, 224966) AND pi_recorded <= 1440
)
SELECT ICUSTAY_ID #### Query error: Column name ICUSTAY_ID is ambiguous
FROM t
GROUP BY ICUSTAY_ID;
Both t1 and t2 have a column called ICUSTAY_ID. When you join them together into a single dataset, you end up with two columns with the same name, which can't work because there would be no way to uniquely identify each column.
You need to alias these columns in your code, or leave one of them out if you don't need both.

how to run a different select statement based on condition in Hive SQL

I would like to know how to run a different select statement based on condition in Hive SQL.
The following query does not work but throws an error.
Error while compiling statement: FAILED: ParseException line 4:2
cannot recognize input near '(' 'SELECT' '1' in expression
specification
SELECT
CASE WHEN '${UN}'!= '' THEN
(
SELECT *
from table1 t
WHERE t.yymmddval BETWEEN '${D1}' AND '${D2}'
AND t.un in ('${UN}')
)
ELSE
(
SELECT *
from table1 t
WHERE t.yymmddval BETWEEN '${D1}' AND '${D2}'
AND t.un in (
(SELECT
o.unq_num as un
FROM table2 as o
WHERE o.date >= '2017-01-01'
AND upper(o.srl_num) in ('${R}')
LIMIT 1)
)
)
END
Use UNION ALL with your queries + add conditions for switching corresponding query:
-- Emulates "run one SELECT or the other": both branches are always part of
-- the statement, but the switching conditions on '${UN}' are mutually
-- exclusive, so at most one branch can return rows.

-- Branch 1: ${UN} was supplied, so filter table1 on that value directly.
SELECT *
FROM table1 t
WHERE '${UN}'!= ''                                   -- switching condition
  AND t.yymmddval BETWEEN '${D1}' AND '${D2}'
  AND t.un IN ('${UN}')
UNION ALL
-- Branch 2: ${UN} is empty, so look the value up in table2 instead.
SELECT *
FROM table1 t
WHERE '${UN}'= ''                                    -- switching condition
  AND t.yymmddval BETWEEN '${D1}' AND '${D2}'
  AND t.un IN (
        SELECT o.unq_num AS un
        FROM table2 AS o
        WHERE o.date >= '2017-01-01'
          AND UPPER(o.srl_num) IN ('${R}')
        LIMIT 1
      )

Hive - select rows within 1 year of earliest date

I am trying to select all rows in a table that are within 1 year of the earliest date in the table. I'm using the following code:
select *
from baskets a
where activitydate < (select date_add((select min(activitydate) mindate_a from baskets), 365) date_b from baskets)
limit 10;
but get the following error message:
Error while compiling statement: FAILED: ParseException line 1:55 cannot recognize input near 'select' 'date_add' '(' in expression specification
Total execution time: 00:00:00.338
Any suggestions?
EDIT:
With this code:
select *
from baskets a
where activitydate < (select date_add(min(activitydate), 365) from baskets)
limit 10;
I'm getting this error:
Error while compiling statement: FAILED: ParseException line 1:55 cannot recognize input near 'select' 'date_add' '(' in expression specification
I'd be tempted to use window functions:
-- Keep the rows whose activitydate is less than 12 months after the earliest
-- activitydate in the table.
-- The empty OVER () turns MIN into a window function over the whole input, so
-- every row carries the global minimum; without it MIN is a plain aggregate
-- and cannot be selected next to b.* without a GROUP BY.
-- Note: the result also contains the helper column min_ad.
select bm.*
from (select b.*, min(b.activitydate) over () as min_ad
      from baskets b
     ) bm
where bm.activitydate < add_months(bm.min_ad, 12);
If you really want your syntax to work, try reducing the number of selects:
where activitydate < (select date_add(min(activitydate), 365) from baskets)
Use a JOIN instead of a SELECT in a subquery. I don't think Hive supports a subquery in the WHERE clause with a < condition; as of Hive 0.13, only IN and EXISTS subqueries can be used there.
See: Hive Language Manual, SubQueries
-- Return up to 10 baskets rows dated less than 365 days after the earliest
-- activitydate in the table.
-- The derived table b yields exactly one row (the cutoff date), so the cross
-- join attaches that cutoff to every baskets row without multiplying rows.
-- The range comparison is placed in WHERE rather than ON because older Hive
-- releases accept only equality conditions in a join's ON clause.
SELECT a.*
FROM baskets a
CROSS JOIN (SELECT DATE_ADD(MIN(src.activitydate), 365) AS maxdate
            FROM baskets src) b
WHERE a.activitydate < b.maxdate
LIMIT 10;

Hive using sum() over function error--Failed to breakup Windowing invocations into Groups

I'm trying to run this query which includes sum(...) over(...) function in Hive but shows error.
And I'm trying to use select distinct in a subquery, but it still doesn't work.
Is there an error in join function?
This is my SQL code
select
c.driver_city_name,
c.driver_car_brand,
c.year,
count(distinct c.driver_id)over(PARTITION BY c.driver_car_brand,c.driver_city_name,c.year),
sum(c.upply_hr)over(PARTITION BY c.driver_car_brand,c.driver_city_name,c.year),
sum(c.work_hr)over(PARTITION BY c.driver_car_brand,c.driver_city_name,c.year)
from (
select
a.driver_city_name,
a.driver_car_brand,
a.driver_id,
year(b.reg_date_cheling) as year,
d.onlinetime as supply_hr,
d.charge_time_length/60/60 as work_hr
from gulfstream_dw.dw_v_driver_base a
join (
select
driver_id,
reg_date_cheling
from g_bi.t_brx_xinzheng_driver_diaodu_1
where driver_city_id in (17,18,2,3,1,5,10,4,24,23)
and concat_ws('-',year,month,day) BETWEEN '2016-11-19' and '2017-02-21'
) b on a.driver_id = b.driver_id
join (
select
e.driver_id,
sum(e.onlinetime) as onlinetime, --在线时长(单位:小时)
sum(e.charge_time_length) as charge_time_length --计费总时长(单位:秒)
from(
select distinct
concat_ws('-',year,month,day) date1,
driver_id,
onlinetime, --在线时长(单位:小时)
charge_time_length --计费总时长(单位:秒)
from gulfstream_dw.dw_m_driver_strategy
where concat_ws('-',year,month,day) between '2016-11-19' and '2017-02-21'
and onlinetime > 0
) e
group by e.driver_id
) d on a.driver_id = d.driver_id
where driver_city_id in (17,18,2,3,1,5,10,4,24,23)
and to_date(max_success_strive_time) BETWEEN '2016-11-20' and '2017-02-20'
and concat_ws('-',year,month,day) BETWEEN '2016-11-19' and '2017-02-21'
and a.driver_car_brand in (
"奇瑞-E3",
"吉利-远景",
)
group by a.driver_city_name,a.driver_car_brand,a.driver_id,year(b.reg_date_cheling),d.onlinetime,d.charge_time_length/60/60
)c
group by c.driver_city_name,c.driver_car_brand,c.year
But there is error:
ERROR Type:SEMANTIC_FAILED Error while compiling statement: FAILED:
SemanticException Failed to breakup Windowing invocations into Groups.
At least 1 group must only depend on input columns. Also check for
circular dependencies. Underlying error:
org.apache.hadoop.hive.ql.parse.SemanticException: Line 6:6 Invalid
column reference 'supply_hr'
If I leave out sum over function and just keep count over function, it runs successfully.

Join Oracle tables on an exact match, and a closest match

I am trying to join two tables of performance metrics, system stats and memory usage. Entries in these tables come in on differing time schedules. I need to join the tables by finding the exact match for the System_Name in both tables, and the closest for WRITETIME. Write time uses the systems own idea of time and is NOT a standard Oracle timestamp.
I can select the closest timestamp from one table with something like:
select "Unix_Memory"."WRITETIME", ABS ('1140408134015004' - "Unix_Memory"."WRITETIME")
as Diff from "Unix_Memory"
where "Unix_Memory"."WRITETIME" > '1140408104015004' order by Diff;
The constants there will be parameterised in my script.
However when I try to expand this into my larger query:
select "System"."System_Name", "System"."WRITETIME" as SysStamp,
from "System"
join "Unix_Memory" on "System"."System_Name" = "Unix_Memory"."System_Name"
and "Unix_Memory"."WRITETIME" = (
select Stamp from (
select "Unix_Memory"."WRITETIME" as Stamp,
ABS ( "System"."WRITETIME" - "Unix_Memory"."WRITETIME") as Diff
from "Unix_Memory" where "Unix_Memory"."WRITETIME" > '1140408104015004' and rownum = 1 order by Diff
)
)
WHERE "System"."System_Name" in ('this','that', 'more')
and "System"."WRITETIME" > '1140408124015004';
I get:
Error at Command Line:38 Column:72
Error report:
SQL Error: ORA-00904: "System"."WRITETIME": invalid identifier
00904. 00000 - "%s: invalid identifier"
I have tried a few variations, but I am not getting any closer.
You must state the System table in the inner Select as well.
-- The asker's query with "System" also joined inside the innermost select,
-- so that "System"."WRITETIME" resolves at that nesting level.
select "System"."System_Name", "System"."WRITETIME" as SysStamp
from "System"
join "Unix_Memory" on "System"."System_Name" = "Unix_Memory"."System_Name"
and "Unix_Memory"."WRITETIME" = (
select Stamp from (
select "Unix_Memory"."WRITETIME" as Stamp,
ABS ( "System"."WRITETIME" - "Unix_Memory"."WRITETIME") as Diff
from "Unix_Memory"
-- THE NEXT LINE IS MISSING IN YOUR CODE
-- (table and column are quoted separately: "System"."System_Name")
INNER JOIN "System" ON "System"."System_Name" = "Unix_Memory"."System_Name"
and "System"."WRITETIME" > '1140408124015004'
-- end of missing
-- NOTE: rownum = 1 is applied before order by, so the row kept here is an
-- arbitrary match, not the one with the smallest Diff.
where "Unix_Memory"."WRITETIME" > '1140408104015004' and rownum = 1 order by Diff
)
)
WHERE "System"."System_Name" in ('this','that', 'more')
and "System"."WRITETIME" > '1140408124015004';
Unfortunately, column names are only visible one nesting level down. So System.writetime would be known in select Stamp from ..., but no longer in select "Unix_Memory"."WRITETIME" as Stamp ...
In any case, you would select a rather random stamp (to be precise, the first "Unix_Memory"."WRITETIME" > '1140408104015004' found), because rownum = 1 gets executed before order by. You will have to re-write your statement completely.
EDIT: Here is one possibility to re-write the statement using MIN/MAX KEEP:
-- For every (system_name, writetime) row of system, return the id of the
-- unix_memory row for the same system whose writetime is closest.
-- KEEP (DENSE_RANK FIRST ORDER BY ...) restricts the aggregate to the rows
-- ranked first by absolute time difference; MIN(um.id) then picks one id
-- if several rows tie for closest.
-- assumes unix_memory has an id column identifying each row — TODO confirm
select
  s.system_name,
  s.writetime as sysstamp,
  min(um.id) keep (dense_rank first order by abs(s.writetime - um.writetime)) as closest_um_id
from system s  -- alias must be s: every column reference in this query uses s.
join unix_memory um on s.system_name = um.system_name
where s.system_name in ('this','that', 'more')
  and s.writetime > '1140408124015004'
  and um.writetime > '1140408104015004'
group by s.system_name, s.writetime
order by s.system_name, s.writetime;
If you need more than just the ID of unix_memory then surround this with another select:
-- Same closest-match lookup as a derived table (sy), joined back to
-- unix_memory to return all columns of the closest row rather than just its id.
-- assumes unix_memory has an id column identifying each row — TODO confirm
select
  sy.system_name,
  sy.sysstamp,
  mem.*
from
(
  -- One row per (system_name, writetime) with the id of the unix_memory row
  -- whose writetime is closest; MIN(um.id) picks one id if several rows tie.
  select
    s.system_name,
    s.writetime as sysstamp,
    min(um.id) keep (dense_rank first order by abs(s.writetime - um.writetime)) as closest_um_id
  from system s  -- alias must be s: every column reference in this subquery uses s.
  join unix_memory um on s.system_name = um.system_name
  where s.system_name in ('this','that', 'more')
    and s.writetime > '1140408124015004'
    and um.writetime > '1140408104015004'
  group by s.system_name, s.writetime
) sy
join unix_memory mem on mem.id = sy.closest_um_id
order by sy.system_name, sy.sysstamp;