How to improve performance in hive

How to improve performance in hive - hive

I am running the query below in Hive over roughly 2 million rows. Is there any way to improve its performance? The source Hive table is partitioned by the created_date column.
-- Keep amt_N only where its descending rank is below 30% of the count of
-- non-'.' values for that column; otherwise return NULL.
SELECT
    t.id,
    CASE WHEN t.amt_1_rank < 0.3 * f.amt_1_count THEN t.amt_1 ELSE NULL END AS amt_1,
    CASE WHEN t.amt_2_rank < 0.3 * f.amt_2_count THEN t.amt_2 ELSE NULL END AS amt_2,
..
..
.. -- Like wise 30 columns e.g. amt_3,amt_3...
FROM (
    -- One global ranking per amount column for the selected partition.
    SELECT
        a.id,
        a.amt_1,
        ROW_NUMBER() OVER (ORDER BY CAST(a.amt_1 AS DECIMAL(8,7)) DESC) AS amt_1_rank,
        a.amt_2,
        ROW_NUMBER() OVER (ORDER BY CAST(a.amt_2 AS DECIMAL(8,7)) DESC) AS amt_2_rank
    FROM source_table a
    WHERE created_date = '2017-10-15'
) t
JOIN (
    -- Single-row summary: how many values per column are not the '.' marker.
    SELECT
        COUNT(CASE WHEN amt_1 = '.' THEN NULL ELSE 1 END) AS amt_1_count,
        COUNT(CASE WHEN amt_2 = '.' THEN NULL ELSE 1 END) AS amt_2_count,
..
..
    FROM source_table
    WHERE created_date = '2017-10-15'
) f

You can do it without join:
-- Same result as the join version, but the per-column counts come from
-- COUNT(...) OVER () so source_table is read only once.
SELECT
    t.id,
    CASE WHEN t.amt_1_rank < 0.3 * t.amt_1_count THEN t.amt_1 ELSE NULL END AS amt_1,
    CASE WHEN t.amt_2_rank < 0.3 * t.amt_2_count THEN t.amt_2 ELSE NULL END AS amt_2,
..
..
.. -- Like wise 30 columns e.g. amt_3,amt_3...
FROM (
    SELECT
        flagged.id,
        flagged.amt_1,
        ROW_NUMBER() OVER (ORDER BY CAST(flagged.amt_1 AS DECIMAL(8,7)) DESC) AS amt_1_rank,
        flagged.amt_2,
        ROW_NUMBER() OVER (ORDER BY CAST(flagged.amt_2 AS DECIMAL(8,7)) DESC) AS amt_2_rank,
        -- COUNT skips NULL flags, so only non-'.' values are counted.
        COUNT(amt_1_flag) OVER () AS amt_1_count,
        COUNT(amt_2_flag) OVER () AS amt_2_count
    FROM (
        -- Flag is 1 for a usable value and NULL for the '.' marker.
        SELECT
            src.*,
            CASE WHEN amt_1 = '.' THEN NULL ELSE 1 END AS amt_1_flag,
            CASE WHEN amt_2 = '.' THEN NULL ELSE 1 END AS amt_2_flag
        FROM source_table src
        WHERE created_date = '2017-10-15'
    ) flagged
) t

Related

SQL Function for updating column with values

To those who have helped me before: I tend to use SAS 9.4 a lot for my day-to-day work; however, there are times when I need to use SQL Server.
There is an output table I have with 2 variables (attached output.csv)
output table
ID, GROUP, DATE
The table has 830 rows:
330 have a "C" group
150 have a "A" group
50 have a "B" group
the remaining 300 have group as "TEMP"
Within SQL I do not know how to programmatically work out the total volume of A+B+C. The aim is to update the "TEMP" rows to ensure there is an equal number of "A" and "B", totalling 250 of each (the remainder of the total count)
so the table totals
330 have a "C" group
250 have a "A" group
250 have a "B" group

You want to proportion the "temp" to get equal amounts of "A" and "B".
So, the idea is to count up everything in A, B, and Temp and divide by 2. That is the final group size. Then you can use arithmetic to allocate the rows in Temp to the two groups:
-- Randomly split the 'Temp' rows between A and B so both groups end up the
-- same size. [group] is bracketed because GROUP is a reserved word in T-SQL.
SELECT
    t.*,
    -- The first (final_group_size - cnt_a) randomly ordered rows top up A;
    -- every remaining 'Temp' row goes to B.
    (CASE WHEN t.seqnum + g.cnt_a <= g.final_group_size THEN 'A' ELSE 'B' END) AS allocated_group
FROM (
    SELECT t.*, ROW_NUMBER() OVER (ORDER BY NEWID()) AS seqnum
    FROM t
    WHERE [group] = 'Temp'
) t
CROSS JOIN (
    -- Integer division: with an odd total, B receives the extra row.
    SELECT (cnt_a + cnt_b + cnt_temp) / 2 AS final_group_size,
           g.*
    FROM (
        SELECT SUM(CASE WHEN [group] = 'A' THEN 1 ELSE 0 END) AS cnt_a,
               SUM(CASE WHEN [group] = 'B' THEN 1 ELSE 0 END) AS cnt_b,
               SUM(CASE WHEN [group] = 'Temp' THEN 1 ELSE 0 END) AS cnt_temp
        FROM t
    ) g
) g
SQL Server makes it easy to put this into an update:
-- Reassign every 'Temp' row to A or B in place so both groups end up the same
-- size. [group] is bracketed because GROUP is a reserved word in T-SQL.
WITH toupdate AS (
    SELECT
        t.*,
        -- The first (final_group_size - cnt_a) randomly ordered rows top up A;
        -- every remaining 'Temp' row goes to B.
        (CASE WHEN t.seqnum + g.cnt_a <= g.final_group_size THEN 'A' ELSE 'B' END) AS allocated_group
    FROM (
        SELECT t.*, ROW_NUMBER() OVER (ORDER BY NEWID()) AS seqnum
        FROM t
        WHERE [group] = 'Temp'
    ) t
    CROSS JOIN (
        -- Integer division: with an odd total, B receives the extra row.
        SELECT (cnt_a + cnt_b + cnt_temp) / 2 AS final_group_size,
               g.*
        FROM (
            SELECT SUM(CASE WHEN [group] = 'A' THEN 1 ELSE 0 END) AS cnt_a,
                   SUM(CASE WHEN [group] = 'B' THEN 1 ELSE 0 END) AS cnt_b,
                   SUM(CASE WHEN [group] = 'Temp' THEN 1 ELSE 0 END) AS cnt_temp
            FROM t
        ) g
    ) g
)
-- Only the base-table column [group] is modified through the CTE.
UPDATE toupdate
SET [group] = allocated_group;

I'd go with a TOP (n) update style approach
-- How many 'Temp' rows group A needs so that A and B finish the same size.
-- With the sample data: (150 + 50 + 300) / 2 - 150 = 100.
DECLARE @need_a int = (
    SELECT SUM(CASE WHEN [Group] IN ('A', 'B', 'Temp') THEN 1 ELSE 0 END) / 2
           - SUM(CASE WHEN [Group] = 'A' THEN 1 ELSE 0 END)
    FROM [TableName]
);
-- Clamp NULL (empty table) and negative values (A already too large) to zero,
-- because TOP rejects both.
SET @need_a = CASE WHEN @need_a > 0 THEN @need_a ELSE 0 END;

-- ORDER BY is only legal in a subquery/CTE together with TOP, so the random
-- pick is done in an updatable CTE rather than inside EXISTS.
WITH picked AS (
    SELECT TOP (@need_a) [Group]
    FROM [TableName]
    WHERE [Group] = 'Temp'
    ORDER BY NEWID()
)
UPDATE picked
SET [Group] = 'A';

-- Whatever is still 'Temp' belongs to B.
UPDATE [TableName]
SET [Group] = 'B'
WHERE [Group] = 'Temp';

Records apart from max date

I have been helped by Metal to write a SQL as below
-- Per (id, OrderDate, RejectDate) group, report the reject date with the
-- 1900-01-01 placeholder replaced by 9999-12-31.
SELECT
    id,
    OrderDate,
    RejectDate,
    MAX(CASE WHEN RejectDate = '1900-01-01' THEN '9999-12-31' ELSE RejectDate END) AS rSum
FROM tableA
GROUP BY
    id,
    OrderDate,
    RejectDate
Now, I would like to find all the records for a particular id below the max reject date, so that I can delete them from a transformation table

An option is to use row_number():
-- Return every row of an id except the one ranked first by reject date
-- (1900-01-01 is treated as 9999-12-31, i.e. as the latest possible date).
SELECT
    ranked.id,
    ranked.OrderDate,
    ranked.RejectDate
FROM (
    SELECT
        src.*,
        ROW_NUMBER() OVER (
            PARTITION BY id
            ORDER BY CASE WHEN RejectDate = '1900-01-01' THEN '9999-12-31' ELSE RejectDate END DESC
        ) AS rn
    FROM tableA src
) ranked
WHERE ranked.rn > 1
The advantage of this technique is that it avoids aggregation, which may lead to better performance. Also, you can easily turn this into a delete statement by leveraging the concept of updateable CTE, as follows:
-- Delete, per id, every row except the one ranked first by reject date
-- (1900-01-01 is treated as 9999-12-31). The delete goes through the CTE
-- to the underlying tableA rows.
WITH cte AS (
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY id
            ORDER BY CASE WHEN RejectDate = '1900-01-01' THEN '9999-12-31' ELSE RejectDate END DESC
        ) AS rn
    FROM tableA src
)
DELETE FROM cte
WHERE rn > 1

This should work...
-- Rows whose reject date is earlier than the latest reject date of the same ID.
-- 1900-01-01 is mapped to 9999-12-31 on both sides of the comparison, matching
-- the sentinel handling used in the question's query.
SELECT *
FROM tableA t1
INNER JOIN (
    SELECT ID,
           MAX(CASE WHEN RejectDate = '1900-01-01' THEN '9999-12-31' ELSE RejectDate END) AS MaxRejectDate
    FROM tableA
    -- One maximum per ID; without GROUP BY the non-aggregated ID column is an error.
    GROUP BY ID
) t2 ON t1.ID = t2.ID
WHERE CASE WHEN t1.RejectDate = '1900-01-01' THEN '9999-12-31' ELSE t1.RejectDate END < t2.MaxRejectDate

Oracle Database Rownum Between

There are 25 records in this sql query
I want to get between 5 and 10. How can I do it ?
I use 11g
-- List every konular row that has a kategori, together with the total number
-- of such rows and a '1'/'0' flag telling whether user 'test' follows it.
SELECT
    (
        SELECT COUNT(*) AS sayfasayisi
        FROM konular t
        WHERE t.kategori IS NOT NULL
    ) AS sayfasayisi,
    t.id,
    t.uye,
    t.baslik,
    t.mesaj,
    t.kategori,
    t.tarih,
    t.edittarih,
    t.aktif,
    t.indirimpuani,
    t.altkategori,
    t.link,
    -- No matching takipkonu row yields NULL, which NVL turns into '0'.
    NVL(
        (
            SELECT CASE WHEN t.id = f.konuid AND f.uye = 'test' THEN '1' ELSE '0' END
            FROM takipkonu f
            WHERE t.id = f.konuid
              AND f.uye = 'test'
        ),
        '0'
    ) AS takip
FROM konular t
WHERE t.kategori IS NOT NULL

You can use ROW_NUMBER() to assign a row number based on some ordering logic contained in your current query, e.g. a certain column. Then, retain only the 5th to 10th records:
-- Pagination sketch: number the rows in an inner query, then keep rows 5..10.
-- The inner query is abbreviated; the "..." line stands for the remaining
-- select-list columns and the FROM / WHERE clauses of the original query.
select t.*
from
(
select
(
-- Total number of konular rows that have a kategori (same value on every row).
select count(*) as sayfasayisi
from konular t
where t.kategori is not null
) as sayfasayisi,
-- some_col is a placeholder: substitute the column(s) that define the row order.
ROW_NUMBER() OVER (ORDER BY some_col) rn,
t.id,
t.uye,
...
) t
-- BETWEEN is inclusive on both ends, so this returns six rows (5 through 10).
where t.rn between 5 and 10;

SQL Server duplicate row

I have a table with duplicate records. I want to mark whether each record is a duplicate or not in another column, let's say a column named Flag. If the record is a duplicate, mark it as 1 in the Flag column, else 0.
How to do this?
I can use a query to select duplicate records.
-- Rows whose clientid occurs more than once among rows with a non-empty
-- PanNoDesc, together with that occurrence count.
SELECT
    o.clientid,
    oc.dupeCount,
    o.pannodesc,
    o.CustNo
FROM CustomerMaster1 o
INNER JOIN (
    SELECT
        clientid,
        COUNT(*) AS dupeCount
    FROM CustomerMaster1
    WHERE ISNULL(PanNoDesc, '') <> ''
    GROUP BY clientid
    HAVING COUNT(*) > 1
) oc
    ON o.clientid = oc.clientid
Simply put: if there are two similar records, mark 1 against the second (duplicated) row; if there are three similar records, mark 1 against two rows, leaving the original record as 0.

Just use count(*) as a window function to calculate the flag:
-- Flag every row whose clientid occurs more than once.
-- dupeCount is computed with a window COUNT because this query has no "oc"
-- derived table to take it from.
SELECT o.clientid,
       COUNT(*) OVER (PARTITION BY o.clientid) AS dupeCount,
       o.pannodesc,
       o.CustNo,
       (CASE WHEN COUNT(*) OVER (PARTITION BY o.clientid) > 1
             THEN 1 ELSE 0
        END) AS IsDuplicate
FROM CustomerMaster1 o;
If you only care about certain records, then you can count them instead:
-- Flag rows whose clientid has more than one row with a non-empty PanNoDesc.
-- The condition uses AND: with OR, an empty string (which is NOT NULL) would
-- still be counted.
SELECT o.clientid,
       SUM(CASE WHEN o.PanNoDesc IS NOT NULL AND o.PanNoDesc <> ''
                THEN 1 ELSE 0
           END) OVER (PARTITION BY o.clientid) AS dupeCount,
       o.pannodesc,
       o.CustNo,
       (CASE WHEN SUM(CASE WHEN o.PanNoDesc IS NOT NULL AND o.PanNoDesc <> ''
                           THEN 1 ELSE 0
                      END) OVER (PARTITION BY o.clientid) > 1
             THEN 1 ELSE 0
        END) AS IsDuplicate
FROM CustomerMaster1 o;
EDIT:
If you want to modify the data, assuming you have a flag, you can just use these statements as a CTE:
-- Write the duplicate flag back to the table through an updatable CTE.
WITH toupdate AS (
    SELECT o.clientid,
           o.pannodesc,
           o.CustNo,
           -- The target column must be exposed by the CTE to be assignable.
           o.Flag,
           -- AND, not OR: empty strings must not be counted as real values.
           (CASE WHEN SUM(CASE WHEN o.PanNoDesc IS NOT NULL AND o.PanNoDesc <> ''
                               THEN 1 ELSE 0
                          END) OVER (PARTITION BY o.clientid) > 1
                 THEN 1 ELSE 0
            END) AS NewIsDuplicate
    FROM CustomerMaster1 o
)
UPDATE toupdate
SET Flag = NewIsDuplicate;

You can write as
-- Demo: within each clientid, the first row keeps DupFlag = 0 and every
-- later row gets DupFlag = 1.
CREATE TABLE CustomerMaster1 (clientid INT, PanNoDesc VARCHAR(10), DupFlag bit)
INSERT INTO CustomerMaster1 (clientid, PanNoDesc, DupFlag) VALUES (1, 'A', NULL), (1, 'B', NULL)
SELECT clientid, PanNoDesc, DupFlag FROM CustomerMaster1
;WITH CTE AS (
    SELECT DupFlag,
           -- Ordering by PanNoDesc fixes which row counts as the "original";
           -- ordering by the partition key itself would leave that arbitrary.
           ROW_NUMBER() OVER (PARTITION BY clientid ORDER BY PanNoDesc ASC) AS rownum
    FROM CustomerMaster1
    WHERE ISNULL(PanNoDesc, '') <> ''
)
-- Update through the CTE so each row is paired with its own rownum. Joining the
-- CTE back on clientid alone would match every row to all rownums in its group.
UPDATE CTE
SET DupFlag = (CASE WHEN rownum > 1 THEN 1 ELSE 0 END)
SELECT clientid, PanNoDesc, DupFlag FROM CustomerMaster1
demo
Edit: Demo based on sample fields provided:
http://sqlfiddle.com/#!3/4592f/1

Joining two different queries under one answer

I have two different queries that have produced the correct result, but I would like to have them produce the answer out in one table. How do I do that?
Here is my code:
-- Query 1: number of distinct employees with a row on or before 2012-05-31.
SELECT
    COUNT(DISTINCT ID) AS NoOfEmployees
FROM Table_Name
WHERE date <= '2012-05-31';
-- Query 2: number of employees with more than one Service_type value
-- recorded before 2012-06-01.
SELECT
    COUNT(subA.ID) AS EmployeesChanged
FROM (
    SELECT emp.ID
    FROM Table_Name emp
    WHERE emp.date < '2012-06-01'
    GROUP BY 1
    HAVING COUNT(emp.Service_type) > 1
) subA
Currently I have the following output:
Number of Employees
x
Employees Changed
x
How do I make it
Number of Employees | Employees Changed | (Number of employees - number changed)
x | x | x

I don't know which database you use, but for some databases you can try:
-- Combine the two single-row counts into one row and add their difference.
-- Each output column gets its own name, the join is an explicit CROSS JOIN,
-- and the grouping column is named instead of referenced by position.
SELECT q1.NoOfEmployees,
       q2.EmployeesChanged,
       q1.NoOfEmployees - q2.EmployeesChanged AS EmployeesUnchanged
FROM (
    SELECT COUNT(DISTINCT ID) AS NoOfEmployees
    FROM Table_Name
    WHERE date <= '2012-05-31'
) q1
CROSS JOIN (
    SELECT COUNT(subA.ID) AS EmployeesChanged
    FROM (
        SELECT A.ID
        FROM Table_Name A
        WHERE A.date < '2012-06-01'
        GROUP BY A.ID
        HAVING COUNT(A.Service_type) > 1
    ) subA
) q2

Is date<= '2012-05-31' meant to be the same as A.date < '2012-06-01'? If so:
-- One pass over the table: count employees, and split them by whether they
-- have more than one Service_type value (the question's "changed" rule,
-- HAVING COUNT(Service_type) > 1).
SELECT COUNT(1) AS NoOfEmployees,
       SUM(CASE WHEN STCount > 1 THEN 1 ELSE 0 END) AS HasChange,
       SUM(CASE WHEN STCount <= 1 THEN 1 ELSE 0 END) AS NoChange
FROM (
    -- The column is unqualified because Table_Name has no alias here.
    SELECT ID,
           COUNT(Service_type) AS STCount
    FROM Table_Name
    WHERE date <= '2012-05-31'
    GROUP BY ID
) AS Data

You can use CROSS JOIN:
-- Put the two single-row counts side by side and append their difference.
SELECT
    a.*,
    b.*,
    a.NoOfEmployees - b.EmployeesChanged
FROM (
    -- Distinct employees with a row on or before 2012-05-31.
    SELECT COUNT(DISTINCT ID) AS NoOfEmployees
    FROM Table_Name
    WHERE date <= '2012-05-31'
) a
CROSS JOIN (
    -- Employees with more than one Service_type value before 2012-06-01.
    SELECT COUNT(subA.ID) AS EmployeesChanged
    FROM (
        SELECT emp.ID
        FROM Table_Name emp
        WHERE emp.date < '2012-06-01'
        GROUP BY 1
        HAVING COUNT(emp.Service_type) > 1
    ) subA
) b
Edit:
You might be able to greatly optimize your query by using conditional aggregation instead of executing two separate queries:
-- Single scan: aggregate once per employee, then summarise those rows.
-- An aggregate cannot be nested inside another aggregate, so the per-employee
-- COUNT is computed in the inner query and tested in the outer one.
SELECT a.NoOfEmployees,
       a.EmployeesChanged,
       a.NoOfEmployees - a.EmployeesChanged
FROM (
    SELECT
        -- Employees with at least one row on or before 2012-05-31.
        SUM(CASE WHEN per_employee.first_date <= '2012-05-31' THEN 1 ELSE 0 END) AS NoOfEmployees,
        -- Employees with more than one Service_type value before 2012-06-01.
        SUM(CASE WHEN per_employee.service_type_count > 1 THEN 1 ELSE 0 END) AS EmployeesChanged
    FROM (
        SELECT ID,
               MIN(date) AS first_date,
               COUNT(Service_type) AS service_type_count
        FROM Table_Name
        -- date <= '2012-05-31' implies date < '2012-06-01', so this one filter
        -- covers both counts. NULL IDs are skipped, as COUNT(ID) did before.
        WHERE date < '2012-06-01'
          AND ID IS NOT NULL
        GROUP BY ID
    ) per_employee
) a

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to improve performance in hive - hive

Related

SQL Function for updating column with values

Records apart from max date

Oracle Database Rownum Between

SQL Server duplicate row

Joining two different queries under one answer

Categories

Resources