Ways to optimize hive query - hive

I have the following hive query
-- Rebuilds the per-guid, per-day activity report for the EmailDripCampaignGlobal
-- treatment group: one row per (guid, click_date) with create/publish flags and counts.
DROP TABLE IF EXISTS dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity;

-- click_date is declared only in PARTITIONED BY: Hive rejects a column that is
-- listed both as a regular column and as a partition column.
CREATE TABLE dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity (
    experiment_name varchar(255),
    variant_name varchar(255),
    first_date string,
    guid string,
    `create` int,      -- CREATE is a reserved word, so the identifier is back-quoted
    publish int,
    sumCreate int,
    sumPublish int
)
PARTITIONED BY (click_date date)
-- "orc.compress" is the ORC table property that selects the compression codec.
STORED AS ORC tblproperties("orc.compress"="SNAPPY");

-- Every partition value comes from the data, so dynamic partitioning must be
-- enabled in non-strict mode for this insert.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

INSERT INTO TABLE dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity
PARTITION (click_date)
SELECT
    'EmailDripCampaignGlobal' as experiment_name,
    'treatment' as variant_name,
    MIN(TO_DATE(b.min_date)) as first_date,
    SUBSTR(post_evar12,1,24) AS guid,
    -- 1 if the guid had at least one matching event that day, else 0
    MAX(CASE WHEN post_prop5='project:createClicked' THEN 1 ELSE 0 END) AS `create`,
    MAX(CASE WHEN post_prop5='project:exportCompleted' OR post_prop5='project:reExportCompleted' THEN 1 ELSE 0 END) AS publish,
    -- number of matching events that day
    SUM(CASE WHEN post_prop5='project:createClicked' THEN 1 ELSE 0 END) AS sumCreate,
    SUM(CASE WHEN post_prop5='project:exportCompleted' OR post_prop5='project:reExportCompleted' THEN 1 ELSE 0 END) AS sumPublish,
    -- the dynamic partition column has to be the last column of the SELECT
    click_date
FROM sourcedata.sc_visitor_click_history_jun_2015 sc
INNER JOIN dwo_analysis.spark_experiment_email_drip_treatment b
    ON SUBSTR(sc.post_evar12,1,24) = b.guid
WHERE report_suite='adbemmarvelweb.prod'
    -- only clicks at or after the guid's min_date in the treatment table
    AND sc.date_time >= b.min_date
    AND click_date >= '2018-01-01'
    AND click_date < DATE_SUB(CURRENT_DATE, 3)
GROUP BY SUBSTR(post_evar12,1,24), click_date;
It takes a long time to execute. Does anyone have any suggestions on how I can optimize it? The reason it takes so long is that the sc_visitor_click_history_jun_2015 table is about 10 TB in size.

Enable hive vectorization and try.
-- Session-level setting: switch on Hive's vectorized query execution before running the query.
set hive.vectorized.execution.enabled = true;

Related

Can this be done as a SQL VIEW

I have a SQL Server table of Customer's Events:
-- One row per event recorded for a customer.
CREATE TABLE CustomerEvent (
    CustomerID INT,       -- customer the event belongs to
    EventType  INT,       -- numeric event kind code
    EventDate  DATETIME   -- when the event happened
)
There can be many EventTypes for the same customer in one day.
EventTypes are like
1 - CheckIn
2 - CheckOut
3 - ExamStart
4 - ExamEnd
Now I want to select the customers that are currently (today) on the premises, that is, customers who have checked in but have not checked out, regardless of whether they are currently taking an exam. Can this be done as a SQL view, or do I have to write a stored procedure for that?
You want today. So I would suggest not exists:
-- Customers with a check-in (eventtype 1) today that has no later check-out (eventtype 2).
-- The table column is EventDate; it is referenced consistently as ce.eventdate.
select ce.customerid
from customerevent ce
where ce.eventtype = 1 and
      -- half-open range covering today only
      ce.eventdate >= current_date and
      ce.eventdate < current_date + interval '1 day' and
      not exists (select 1
                  from customerevent ce2
                  where ce2.customerid = ce.customerid and
                        ce2.eventtype = 2 and
                        ce2.eventdate > ce.eventdate
                 );
You can easily incorporate this into a view.
Note: date/time functions are notoriously database specific, so the exact syntax for "today" may vary.
EDIT:
In SQL Server, this can be written as:
-- SQL Server form: customers with a check-in (eventtype 1) today and no later
-- check-out (eventtype 2).
select ce.customerid
from customerevent ce
where ce.eventtype = 1 and
      -- cast(getdate() as date) is midnight today; comparing the bare column keeps
      -- the predicate index-friendly
      ce.eventdate >= cast(getdate() as date) and
      ce.eventdate < dateadd(day, 1, cast(getdate() as date)) and
      not exists (select 1
                  from customerevent ce2
                  where ce2.customerid = ce.customerid and
                        ce2.eventtype = 2 and
                        ce2.eventdate > ce.eventdate
                 );
You can use aggregation, and filter with a having clause that compares the last check in of each customer to their last check out:
-- Customers whose latest check-in (eventtype 1) is later than their latest
-- check-out (eventtype 2), or who have checked in but have no check-out at all.
CREATE VIEW customerview AS
SELECT customerid
FROM customerevent
GROUP BY customerid
HAVING
    MAX(CASE WHEN eventtype = 2 THEN eventdate END)
        < MAX(CASE WHEN eventtype = 1 THEN eventdate END)
    OR (
        -- no check-out row exists, but at least one check-in does
        MAX(CASE WHEN eventtype = 2 THEN eventdate END) IS NULL
        AND MAX(CASE WHEN eventtype = 1 THEN eventdate END) IS NOT NULL
    )
The second condition in the having condition handles customers that checked in at least once but never checked out.
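We can simplify the query a little by giving the check-out case expression an else branch with a fixed date that you are sure is prior to any row in your table (wrapping the check-out max() in coalesce() would work as well):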
-- Replacement for the view's HAVING clause.
-- Rows that do not match eventtype = 2 fall into the else branch and contribute
-- the fixed date '19700101', so the right-hand max() is not null for any group.
having max(case when eventtype = 1 then eventdate end)
> max(case when eventtype = 2 then eventdate else '19700101' end)

Categorize Overlap Type - Oracle

I've been able to use How do I find the total number of used days in a month? to answer how many TOTAL days we've had cats and dogs which is helpful but I need to know how many days we had:
Cats only: 4
Dogs only: 5
Both: 6
Thank you in advance!
-- One row per stay: which guest, what kind of animal, and the check-in/check-out dates.
CREATE TABLE "ANIMALGUESTS" (
    "ID"        NUMBER,
    "GUESTNAME" VARCHAR2(20 BYTE),
    "GUESTTYPE" VARCHAR2(20 BYTE),
    "CHECKIN"   DATE,
    "CHECKOUT"  DATE
);

-- Sample stays for January 2019.
INSERT INTO ANIMALGUESTS (ID, GUESTNAME, GUESTTYPE, CHECKIN, CHECKOUT)
VALUES (1, 'Tom', 'Cat', TO_DATE('01-JAN-19', 'DD-MON-RR'), TO_DATE('10-JAN-19', 'DD-MON-RR'));

INSERT INTO ANIMALGUESTS (ID, GUESTNAME, GUESTTYPE, CHECKIN, CHECKOUT)
VALUES (2, 'Spike', 'Dog', TO_DATE('03-JAN-19', 'DD-MON-RR'), TO_DATE('05-JAN-19', 'DD-MON-RR'));

INSERT INTO ANIMALGUESTS (ID, GUESTNAME, GUESTTYPE, CHECKIN, CHECKOUT)
VALUES (3, 'Spike', 'Dog', TO_DATE('08-JAN-19', 'DD-MON-RR'), TO_DATE('12-JAN-19', 'DD-MON-RR'));

INSERT INTO ANIMALGUESTS (ID, GUESTNAME, GUESTTYPE, CHECKIN, CHECKOUT)
VALUES (4, 'Cherie', 'Cat', TO_DATE('07-JAN-19', 'DD-MON-RR'), TO_DATE('09-JAN-19', 'DD-MON-RR'));

INSERT INTO ANIMALGUESTS (ID, GUESTNAME, GUESTTYPE, CHECKIN, CHECKOUT)
VALUES (5, 'Tyke', 'Dog', TO_DATE('10-JAN-19', 'DD-MON-RR'), TO_DATE('15-JAN-19', 'DD-MON-RR'));
Using conditional aggregation and inline calendar table:
-- Counts the days on which only cats, only dogs, or both kinds of guest were present.
WITH calendar AS (
    -- one row per day, 365 days starting at 2019-01-01
    SELECT DATE '2019-01-01' + ROWNUM - 1 AS dt
    FROM DUAL
    CONNECT BY ROWNUM < 366
),
daily AS (
    -- per calendar day: how many distinct guest types were present, and the
    -- alphabetically smallest one (the only one when type_cnt = 1)
    SELECT
        c.dt,
        COUNT(DISTINCT a.GUESTTYPE) AS type_cnt,
        MIN(a.GUESTTYPE)            AS first_type
    FROM calendar c
    LEFT JOIN "ANIMALGUESTS" a
        ON c.dt BETWEEN a.CHECKIN AND a.CHECKOUT
    GROUP BY c.dt
)
SELECT
    SUM(CASE WHEN type_cnt = 2 THEN 1 END)                        AS both,
    SUM(CASE WHEN type_cnt = 1 AND first_type = 'Cat' THEN 1 END) AS cats_only,
    SUM(CASE WHEN type_cnt = 1 AND first_type = 'Dog' THEN 1 END) AS dogs_only
FROM daily;
db<>fiddle demo
Oracle 12c supports recursive CTEs, so you can expand the data and then aggregate:
-- Expands each stay into one row per day with a recursive CTE, then counts the
-- days with only cats, only dogs, or both.
-- Oracle requires the column alias list on a recursive WITH clause.
with cte (dt, checkout, guesttype) as (
      -- anchor: first day of every stay
      select checkin, checkout, guesttype
      from ANIMALGUESTS
      union all
      -- recursion: step one day forward until the check-out date is reached
      select dt + 1, checkout, guesttype
      from cte
      where dt < checkout
     )
select sum(case when cats > 0 and dogs > 0 then 1 else 0 end) as both,
       sum(case when cats > 0 and dogs = 0 then 1 else 0 end) as cats_only,
       sum(case when cats = 0 and dogs > 0 then 1 else 0 end) as dogs_only
from (-- per day: number of cat stays and dog stays covering that day
      select dt, sum(case when guesttype = 'Cat' then 1 else 0 end) as cats,
             sum(case when guesttype = 'Dog' then 1 else 0 end) as dogs
      from cte
      group by dt
     ) daily;
This generates the result set as columns in a row, rather than separate rows.

Difference between rows with the same value

I have the table
-- Bonus movements: one row per amount booked for a user.
CREATE TABLE fct_bonus (
    date   TIMESTAMP     NOT NULL,
    type   VARCHAR(10)   NOT NULL,
    amount NUMERIC(19,2) NOT NULL,
    userid VARCHAR(30)   NOT NULL
)
type can be IN or OUT, amount is always >0
I need to find the sums of ins and outs for userid 123 on date '2016-08-01', and also the balance, which should be calculated as all ins minus all outs of userid 123.
I use the query
-- Total amount per type for one user up to the cutoff date.
-- userid is varchar(30), so it is compared with a string literal, and every
-- non-aggregated select column appears in GROUP BY (which also makes DISTINCT unnecessary).
select userid, type, sum(amount)
from fct_bonus
where userid = '123' and date <= '2016-08-01'
group by userid, type
but I don't know how to calculate the balance. Please help.
This would seem to do what you are describing:
-- Sum of IN amounts, sum of OUT amounts, and the balance (ins minus outs) for one user.
select userid,
       -- sum the amounts (not the row counts) per direction
       sum(case when type = 'IN' then amount else 0 end) as ins,
       sum(case when type = 'OUT' then amount else 0 end) as outs,
       sum(case when type = 'IN' then amount when type = 'OUT' then - amount else 0 end) as balance
from fct_bonus
-- userid is varchar(30), so compare it with a string literal
where userid = '123' and date <= '2016-08-01'
group by userid;

How to show 0 value using COUNT and SELECT on a SQL query

I have ONLY 1 table called Meeting that stores all meeting requests.
This table can be EMPTY.
It has several columns including requestType (which can only be "MT") meetingStatus (can only be either pending, approved, denied or canceled) and meetingCreatedTime
I want to count how many requests of each status's type (in other words how many requests are pending, how many are approved, denied and canceled) for the last 30 days
The problem is that if there are no requests, nothing is displayed, but I want to display 0. How do I do that? Here is my current query:
-- Per request type: number of meetings in each status created within the last 30 days.
-- Each status count is a separate scalar subquery over [Meeting].
SELECT
    [requestType],
    (
        SELECT COUNT([requestType])
        FROM [Meeting]
        WHERE CAST([meetingCreatedTime] AS DATE) >= CAST(DATEADD(DAY, -30, GETDATE()) AS DATE)
          AND [meetingStatus] = 'Approved'
    ) AS 'Approved',
    (
        SELECT COUNT([requestType])
        FROM [Meeting]
        WHERE CAST([meetingCreatedTime] AS DATE) >= CAST(DATEADD(DAY, -30, GETDATE()) AS DATE)
          AND [meetingStatus] = 'Pending'
    ) AS 'Pending',
    (
        SELECT COUNT([requestType])
        FROM [Meeting]
        WHERE CAST([meetingCreatedTime] AS DATE) >= CAST(DATEADD(DAY, -30, GETDATE()) AS DATE)
          AND [meetingStatus] = 'Canceled'
    ) AS 'Canceled',
    (
        SELECT COUNT([requestType])
        FROM [Meeting]
        WHERE CAST([meetingCreatedTime] AS DATE) >= CAST(DATEADD(DAY, -30, GETDATE()) AS DATE)
          AND [meetingStatus] = 'Denied'
    ) AS 'Denied'
FROM [Meeting]
WHERE CAST([meetingCreatedTime] AS DATE) >= CAST(DATEADD(DAY, -30, GETDATE()) AS DATE)
GROUP BY [requestType]
Result:
What I want is:
-- One row per request type with the number of meetings in each status over the
-- last 30 days; a request type with no matching meetings shows 0 in every column.
SELECT
    RT.requestType,
    SUM(CASE WHEN M.meetingStatus = 'Approved' THEN 1 ELSE 0 END) AS Approved,
    SUM(CASE WHEN M.meetingStatus = 'Pending' THEN 1 ELSE 0 END) AS Pending,
    SUM(CASE WHEN M.meetingStatus = 'Canceled' THEN 1 ELSE 0 END) AS Canceled,
    SUM(CASE WHEN M.meetingStatus = 'Denied' THEN 1 ELSE 0 END) AS Denied
FROM
    -- Fixed list of request types, so a row is produced even when Meeting is empty
    -- (selecting the distinct types from Meeting itself would return nothing then).
    -- Swap in your request-type lookup table if you have one.
    (VALUES ('MT')) AS RT (requestType)
LEFT OUTER JOIN Meeting M ON
    M.requestType = RT.requestType AND
    M.meetingCreatedTime >= DATEADD(DAY, -30, GETDATE())
GROUP BY
    RT.requestType
The SUMs are a much clearer (IMO) and much more efficient way of getting the counts that you need. Using the requestType table (assuming that you have one) lets you get results for every request type even if there are no meetings of that type in the date range. The LEFT OUTER JOIN to the meeting table allows the request type to still show up even if there are no meetings for that time period.
All of your CASTs between date values seem unnecessary.
Move those subqueries into simple sum/case statements:
-- One row for request type 'MT' with the number of meetings in each status over
-- the last 30 days; all counts are 0 when no meeting matches.
select rt.request_type,
       sum(case when m.[meetingStatus] = 'Approved' then 1 else 0 end) as Approved,
       sum(case when m.[meetingStatus] = 'Pending' then 1 else 0 end) as Pending,
       sum(case when m.[meetingStatus] = 'Canceled' then 1 else 0 end) as Canceled,
       sum(case when m.[meetingStatus] = 'Denied' then 1 else 0 end) as Denied
from ( select 'MT' ) rt (request_type) --hopefully you have lookup table for this
left
join [Meeting] m on
     -- the Meeting column is named requestType
     rt.request_type = m.[requestType] and
     -- the cutoff is midnight 30 days ago; comparing the bare column gives the same
     -- rows as casting it to date and lets an index on meetingCreatedTime be used
     m.[meetingCreatedTime] >= CAST(DateAdd(DAY,-30,Getdate()) AS DATE)
group
by rt.request_type;
This is one possible approach to force one line to be visible in any case. Adapt this to your needs...
Copy it into an empty query window and execute... play around with the WHERE part...
-- Demo: always return exactly one row of counts. The real counts carry Marker 1 and a
-- fallback row of zeros carries Marker 2; ORDER BY Marker with TOP 1 picks the real row
-- when it exists and the zero row otherwise.
-- A table variable is declared with an @ name (a # name denotes a temp table and
-- cannot be used with DECLARE).
DECLARE @Test TABLE (ID INT IDENTITY, GroupingKey VARCHAR(100));
INSERT INTO @Test VALUES ('a'),('a'),('b');
SELECT TOP 1 tbl.CountOfA
            ,tbl.CountOfB
            ,tbl.CountOfC
FROM
(
    SELECT 1 AS Marker
          ,(SELECT COUNT(*) FROM @Test WHERE GroupingKey='a') AS CountOfA
          ,(SELECT COUNT(*) FROM @Test WHERE GroupingKey='b') AS CountOfB
          ,(SELECT COUNT(*) FROM @Test WHERE GroupingKey='c') AS CountOfC
    WHERE (1=1) --play here with (1=0) and (1=1)
    UNION ALL
    SELECT 2,0,0,0
) AS tbl
ORDER BY Marker

SQL Server query, remove date dimension

I need help in removing the date dimension from the query below. In other words make the query independent of the date / time interval
My goal is to load the table into SSAS so that i would not have to change the date every time i run reports.
the query is huge (months, quarters, years, and aggregated date CR12,PR12 ...), i just gave a short example below
I sincerely appreciate any help.
-- Rebuilds #tmptmp: every row of #tmpTouchpointsEnrollments that has an
-- EnrollmentSentDate, plus a computed countdays column.
-- countdays = day difference minus the parenthesized group below
--   (2 per DATEDIFF(WEEK) boundary + 1 if the enrollment date is a Sunday
--    + 1 if the shipment date is a Saturday - number of factory holidays in range).
-- NOTE(review): the enrollment date is spelled enrollmentsDate, enrollmentsenttDate,
-- enrollmentsentDate and EnrollmentSentDate below, and the shipment date appears as
-- both ShipmentDate and InitialShipmentDate -- confirm which columns actually exist.
-- NOTE(review): the holiday count sits inside the subtracted group with a minus sign,
-- so it is effectively added to countdays -- confirm that this is intended.
drop table #tmptmp
SELECT *, (DATEDIFF(day, enrollmentsDate, ShipmentDate))
- ((DATEDIFF(WEEK, enrollmentsenttDate, InitialShipmentDate) * 2)
+(CASE WHEN DATENAME(DW, enrollmentsentDate) = 'Sunday' THEN 1 ELSE 0 END)
+(CASE WHEN DATENAME(DW, ShipmentDate) = 'Saturday' THEN 1 ELSE 0 END)
-- holidays on or after the enrollment date and before the shipment date
- (select count(*) from tblFactoryHolidayDates where Date >= enrollmentsentDate
and Date < InitialShipmentDate)) as countdays into #tmptmp from
#tmpTouchpointsEnrollments
where EnrollmentSentDate is not null
----------------------------
-- Rebuild #tmp with only the #tmptmp rows whose countdays is below 20.
DROP TABLE #tmp

SELECT *
INTO #tmp
FROM #tmptmp
WHERE countdays < 20
-- Rebuild #tmpMetric with a single 'Avg days' row: the average countdays over the
-- rows of #tmp whose EnrollmentReceiveddate falls in 2010.
drop table #tmpMetric
Select 'GrandTotal' as Dummy,
       'Avg days' as Metrics,
       '1' as MetricOrder,
       -- Year() returns an int, so compare with a number; NULLIF turns a zero count
       -- into NULL so an empty year yields NULL instead of a divide-by-zero error
       Sum(case when Year(EnrollmentReceiveddate) = 2010 then (countdays) end) * 1.0
           / NULLIF(count(case when Year(EnrollmentReceiveddate) = 2010 then (patientID) end), 0) * 1.0 as Y2010
into #tmpMetric
from #tmp
Thank you very much