SQL not using ALIAS column for calculation - sql

Question Statement - From the given trips and users tables for a taxi service, write a query to return the cancellation rate in the first two days in October, rounded to two decimal places, for trips not involving banned riders or drivers.
Question code on Oracle SQL.
-- Sample taxi trips: one row per trip with its rider, driver, outcome and request date.
create table trips (
    trip_id      int,
    rider_id     int,
    driver_id    int,
    status       varchar2(200),
    request_date date
);

insert into trips (trip_id, rider_id, driver_id, status, request_date) values (1, 1, 10, 'completed', date '2020-10-01');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (2, 2, 11, 'cancelled_by_driver', date '2020-10-01');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (3, 3, 12, 'completed', date '2020-10-01');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (4, 4, 10, 'cancelled_by_driver', date '2020-10-02');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (5, 1, 11, 'completed', date '2020-10-02');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (6, 2, 12, 'completed', date '2020-10-02');
insert into trips (trip_id, rider_id, driver_id, status, request_date) values (7, 3, 11, 'completed', date '2020-10-03');

-- Sample users: riders and drivers share one table, distinguished by the type column.
create table users (
    user_id int,
    banned  varchar2(200),
    type    varchar2(200)
);

insert into users (user_id, banned, type) values (1, 'no', 'rider');
insert into users (user_id, banned, type) values (2, 'yes', 'rider');
insert into users (user_id, banned, type) values (3, 'no', 'rider');
insert into users (user_id, banned, type) values (4, 'no', 'rider');
insert into users (user_id, banned, type) values (10, 'no', 'driver');
insert into users (user_id, banned, type) values (11, 'no', 'driver');
insert into users (user_id, banned, type) values (12, 'no', 'driver');
My solution code is below. However, I get the following error. Can someone please help?
ORA-00904: "TOTAL_TRIPS": invalid identifier
SOLUTION CODE:
-- Outer query: turns the per-date counts from the inline view into a rate per request_date.
-- NOTE: trips_completed and total_trips are referenced here WITHOUT double quotes, while the
-- inline view declares those aliases WITH double quotes in lower case. This unquoted reference
-- is the one reported as ORA-00904: "TOTAL_TRIPS": invalid identifier.
select request_date, (1-(trips_completed/total_trips)) as "cancel_rate"
from
((
-- Per request_date: count of completed trips, of driver-cancelled trips, and of both together.
select request_date,
sum(case when status = 'completed' then 1 else 0 end) as "trips_completed",
sum(case when status = 'cancelled_by_driver' then 1 else 0 end) as "trips_cancelled",
sum(case when status = 'cancelled_by_driver' then 1 when status= 'completed' then 1 else 0 end) as "total_trips"
from
(
-- Trips joined to users on rider_id only; both "not_banned_*" columns are the same u.banned value,
-- so the driver's banned flag is never looked up here.
select t.rider_id, t.driver_id, t.status, t.request_date, u.banned as "not_banned_rider", u.banned as "not_banned_driver"
from trips t
join users u
on t.rider_id=u.user_id
where u.banned='no'
)
group by request_date
-- Drops the 2020-10-03 group after aggregation; every other date is kept.
having request_date <> to_date ('2020-10-03', 'YYYY/MM/DD')
));

First, don't put identifiers in double quotes. They just clutter up queries.
Some other things to fix:
No need for two levels of subqueries.
Learn to use proper date literal syntax.
I think you want < rather than <>.
So that suggests:
-- Cancellation rate per request date before 2020-10-03, for trips whose rider is not banned.
-- Only 'completed' and 'cancelled_by_driver' trips count towards total_trips.
-- The join is on rider_id only, so the driver's banned flag is not checked.
select rd.request_date,
       -- nullif() yields NULL instead of a divide-by-zero error for a date that has
       -- neither completed nor driver-cancelled trips.
       1 - (rd.trips_completed / nullif(rd.total_trips, 0)) as cancel_rate
from (select t.request_date,
             sum(case when t.status = 'completed' then 1 else 0 end) as trips_completed,
             sum(case when t.status = 'cancelled_by_driver' then 1 else 0 end) as trips_cancelled,
             sum(case when t.status in ('cancelled_by_driver', 'completed') then 1 else 0 end) as total_trips
      from trips t
      join users u
        on t.rider_id = u.user_id
      where u.banned = 'no'
        and t.request_date < date '2020-10-03'
      group by t.request_date
     ) rd;
This can be further simplified using avg():
-- Cancellation rate per request date before 2020-10-03, for trips whose rider is not banned.
-- avg() over a 1/0 flag gives cancelled / (cancelled + completed):
--   'cancelled_by_driver' -> 1, 'completed' -> 0, any other status -> NULL (ignored by avg).
-- Flagging 'completed' as 1 instead would return the completion rate, not the cancel rate.
select t.request_date,
       avg(case
               when t.status = 'cancelled_by_driver' then 1
               when t.status = 'completed' then 0
           end) as cancel_rate
from trips t
join users u
  on t.rider_id = u.user_id
where u.banned = 'no'
  and t.request_date < date '2020-10-03'
group by t.request_date;
Note: This addresses fixing the query in your question. It doesn't actually correctly answer the question, for the following reasons:
I'm pretty sure the question asks for a single cancellation rate covering both days, not one rate per date.
It doesn't take into account banned drivers.
I'm not sure how "cancelled by user" would be handled.

ORA-00904: "TOTAL_TRIPS": invalid identifier
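just means that the identifier total_trips, as referenced in the outer query, is invalid: the inner query declares the alias in double quotes ("total_trips"), which makes it case-sensitive, while the unquoted reference is resolved as TOTAL_TRIPS.
Just declare the alias as total_trips (without the double quotes).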

Related

Get userwise balance and first transaction date of users in SQL

I have created a Transaction table with columns card_id, amount, created_at. There may be more than 1 row of one user so I want to return the value card_id, sum(amount), first created_at date of all users.
-- Sample card transactions: several rows per card, some sharing the same date.
CREATE TABLE Transactions (
    card_id    int,
    amount     money,
    created_at date
);

INSERT INTO Transactions (card_id, amount, created_at)
VALUES
    (1, 500, '2016-01-01'),
    (1, 100, '2016-01-01'),
    (1, 100, '2016-01-01'),
    (1, 200, '2016-01-02'),
    (1, 300, '2016-01-03'),
    (2, 100, '2016-01-04'),
    (2, 200, '2016-01-05'),
    (3, 700, '2016-01-06'),
    (1, 100, '2016-01-07'),
    (2, 100, '2016-01-07'),
    (3, 100, '2016-01-07');
I have created a function for that, but one of my clients says I need a query, not a function. Can anyone here suggest what query to use?
-- Multi-statement table-valued function: one row per card in Transactions.
-- Returns:
--   card_id                 the card (int, matching Transactions.card_id)
--   dateOfFirstTransaction  earliest created_at among the card's rows with amount < 0
--   amount                  sum of the card's amount < 0 rows on that first date
-- Cards with no amount < 0 rows keep NULL in both computed columns.
-- NOTE(review): only negative amounts are considered, while the sample rows shown with the
-- question are all positive -- confirm the intended sign filter.
CREATE FUNCTION [dbo].[card_id_data]()
RETURNS @t TABLE
(
    card_id int,
    amount money,
    dateOfFirstTransaction date
)
AS
BEGIN
    -- Seed one row per card; the remaining columns are filled in by the updates below.
    INSERT INTO @t (card_id)
    SELECT DISTINCT card_id
    FROM Transactions;

    -- First date per card (GROUP BY already yields one row per card, so no DISTINCT is needed).
    UPDATE @t
    SET dateOfFirstTransaction = b.createdat
    FROM (
        SELECT card_id AS cardid,
               MIN(created_at) AS createdat
        FROM Transactions
        WHERE amount < 0
        GROUP BY card_id
    ) b
    WHERE card_id = b.cardid;

    -- Total per card and date, matched to the first date stored above.
    -- A single SUM is used: aggregates cannot be nested in T-SQL.
    UPDATE @t
    SET amount = T.AMOUNT
    FROM (
        SELECT card_id AS cardid,
               SUM(amount) AS AMOUNT,
               created_at
        FROM Transactions
        WHERE amount < 0
        GROUP BY card_id, created_at
    ) T
    WHERE card_id = T.cardid
      AND dateOfFirstTransaction = T.created_at;

    RETURN
END
I want a result as shown in this screenshot:
You can use DENSE_RANK for this. It will number the rows, taking into account tied places (same dates)
-- Per card: total amount and date of the rows that fall on the card's earliest created_at.
SELECT
    ranked.card_id,
    SUM(ranked.amount) AS SumAmount,
    MIN(ranked.created_at) AS FirstDate
FROM (
    -- DENSE_RANK gives every row on a card's earliest date rank 1 (ties share a rank).
    SELECT
        tr.card_id,
        tr.amount,
        tr.created_at,
        DENSE_RANK() OVER (PARTITION BY tr.card_id ORDER BY tr.created_at) AS rn
    FROM dbo.Transactions AS tr
) AS ranked
WHERE ranked.rn = 1
GROUP BY ranked.card_id;
If the dates are actually dates and times, and you want to sum the whole day, change t.created_at to CAST(t.created_at AS date)
Try this:
/*
CREATE TABLE dbo.Transactions
(
card_id INT,
amount MONEY,
created_at DATE
);
INSERT INTO dbo.Transactions (card_id, amount, created_at)
VALUES (1, 500, '2016-01-01'),
(1, 100, '2016-01-01'),
(1, 100, '2016-01-01'),
(1, 200, '2016-01-02'),
(1, 300, '2016-01-03'),
(2, 100, '2016-01-04'),
(2, 200, '2016-01-05'),
(3, 700, '2016-01-06'),
(1, 100, '2016-01-07'),
(2, 100, '2016-01-07'),
(3, 100, '2016-01-07');
*/
-- first_txn: earliest transaction date for each card.
WITH first_txn AS
(
    SELECT
        card_id,
        MIN(created_at) AS FirstDate
    FROM dbo.Transactions
    GROUP BY card_id
)
-- Join back to the rows on that first date and total them, one output row per card.
SELECT
    tx.card_id,
    SUM(tx.amount) AS SumAmount,
    ft.FirstDate AS FirstDate
FROM dbo.Transactions AS tx
INNER JOIN first_txn AS ft
    ON ft.card_id = tx.card_id
   AND ft.FirstDate = tx.created_at
GROUP BY
    tx.card_id,
    ft.FirstDate
You'll get an output something like this:
card_id SumAmount FirstDate
--------------------------------
1 700.00 2016-01-01
2 100.00 2016-01-04
3 700.00 2016-01-06
Is that what you're looking for??
UPDATE: OK, so you want to sum the amount only for the first_date, for every card_id - is that correct? (wasn't clear from the original question)
Updated my solution accordingly

sql that finds records within 3 days of a condition being met

I am trying to find all records that exist within a date range prior to an event occurring. In my table below, I want to pull all records that are 3 days or less before the switch field changes from 0 to 1, ordered by date, partitioned by product. My solution does not work: it includes the first record, which should be skipped because it is outside the 3-day window. I am scanning a table with millions of records; is there a way to reduce the complexity/cost while still getting my desired results?
http://sqlfiddle.com/#!18/eebe7
CREATE TABLE productlist
([product] varchar(13), [switch] int, [switchday] date)
;
INSERT INTO productlist
([product], [switch], [switchday])
VALUES
('a', 0, '2019-12-28'),
('a', 0, '2020-01-02'),
('a', 1, '2020-01-03'),
('a', 0, '2020-01-06'),
('a', 0, '2020-01-07'),
('a', 1, '2020-01-09'),
('a', 1, '2020-01-10'),
('a', 1, '2020-01-11'),
('b', 1, '2020-01-01'),
('b', 0, '2020-01-02'),
('b', 0, '2020-01-03'),
('b', 1, '2020-01-04')
;
my solution:
-- switches: tags each switch=0 row whose next row (per product, ordered by switchday) has switch=1.
with switches as (
SELECT
*,
case when lead(switch) over (partition by product order by switchday)=1
and switch=0 then 'first day switch'
else null end as leadswitch
from productlist
),
-- switchdays: only the tagged rows, i.e. the switch=0 row immediately before each 0 -> 1 change.
switchdays as (
select * from switches
where leadswitch='first day switch'
)
select pl.*
,'lead'
from productlist pl
-- NOTE(review): this is a LEFT JOIN and no ss column is tested in WHERE, so the join cannot
-- remove any pl row; every switch=0 row is returned, including rows outside the 3-day window.
-- NOTE(review): pl.switchday = ss.switchday only matches rows on the same day, so the datediff
-- condition below is always 0 for matched rows.
left join switchdays ss
on pl.product=ss.product
and pl.switchday = ss.switchday
and datediff(day, pl.switchday, ss.switchday)<=3
where pl.switch=0
desired output, capturing records that occur within 3 days of a switch going from 0 to 1, for each product, ordered by date:
product switch switchday
a 0 2020-01-02 lead
a 0 2020-01-06 lead
a 0 2020-01-07 lead
b 0 2020-01-02 lead
b 0 2020-01-03 lead
If I understand correctly, you can just use lead() twice:
-- Keeps switch=0 rows where one of the next two rows (per product, by switchday) has switch=1.
-- The look-ahead is counted in rows, not in days.
select nxt.*
from (
    select src.*,
           lead(src.switch, 1) over (partition by src.product order by src.switchday) as next_switch_1,
           lead(src.switch, 2) over (partition by src.product order by src.switchday) as next_switch_2
    from productlist src
) nxt
where nxt.switch = 0
  and (nxt.next_switch_1 = 1 or nxt.next_switch_2 = 1);
Here is a db<>fiddle.
EDIT (based on comment):
-- Keeps switch=0 rows that have a switch=1 row for the same product at most 3 days later.
-- The column is switchday (as declared in productlist), and the alias next_switch_1_day is
-- referenced under the same name it is declared with.
select pl.*
from (select pl.*,
             -- Ordering descending makes the running min() cover the current row and all later
             -- rows, so it yields the earliest switch=1 day on or after this row.
             min(case when switch = 1 then switchday end)
                 over (partition by product order by switchday desc) as next_switch_1_day
      from productlist pl
     ) pl
where switch = 0 and
      -- 3 days matches the "3 days or less" requirement and keeps a / 2020-01-06
      -- (3 days before the 2020-01-09 switch), which is part of the desired output.
      next_switch_1_day <= dateadd(day, 3, switchday);

SQL SUM stop if hit a threshold

I am doing a back testing where I need to calculate each store's losses if I apply a $1000 threshold block on the sale_amount.
For example, for store_id = a, the first two rows add up to 700, but the third transaction ($400) will still go through, bringing the total to 700 + 400 = 1100. Then a batch process runs and triggers the 1000 block, so the 4th transaction gets blocked. What I need to calculate is the total amount after the block is triggered, which for store_id = a is $99. For store b it is $800+100+50.
This is my sample data, please advise how to use temporary table to solve this
-- Sample sales: transactions per store, in Trans_id order.
create table stadium
(
    Trans_id    int,
    -- Explicit length: a bare varchar is only one character wide in some engines (e.g. SQL Server).
    Store_id    varchar(10),
    sale_amount int
);

-- Numeric columns get numeric literals rather than quoted strings, so no implicit conversion is needed.
insert into stadium (Trans_id, Store_id, sale_amount)
values
    (1, 'a', 500),
    (2, 'a', 200),
    (3, 'a', 400),
    (4, 'a', 99),
    (5, 'b', 700),
    (6, 'b', 100),
    (7, 'b', 800),
    (8, 'b', 100),
    (9, 'b', 50);
I think this requires a recursive CTE -- because you hit a threshold, but skip and keep on going.
So, this does what you describe:
-- s: numbers each store's transactions 1..n in trans_id order so the recursion can step row by row.
with s as (
select s.*, row_number() over (partition by store_id order by trans_id) as seqnum
from stadium s
),
-- cte: walks each store's transactions in sequence, carrying a running total that never exceeds 1000.
cte as (
-- Anchor: the first transaction of every store starts the running total and is always flagged included.
select store_id, trans_id, sale_amount as running_amount, 1 as include_flag, seqnum
from s
where seqnum = 1
union all
-- Step: a transaction is added only if the total stays <= 1000; otherwise the total is carried
-- forward unchanged and the transaction is flagged 0 (skipped). Later rows are still evaluated.
select s.store_id, s.trans_id,
(case when s.sale_amount + cte.running_amount <= 1000 then s.sale_amount + cte.running_amount else cte.running_amount end),
(case when s.sale_amount + cte.running_amount <= 1000 then 1 else 0 end),
s.seqnum
from cte join
s
on s.store_id = cte.store_id and s.seqnum = cte.seqnum + 1
)
-- Result: the transactions that were skipped because they would have pushed the total above 1000.
select s.*
from cte join
s
on s.trans_id = cte.trans_id
where include_flag = 0;
Here is a db<>fiddle.

SQL: select data before first occurence of a certain value

For example, I have order data come from customers, like this
test = spark.createDataFrame([
(0, 1, 1, "2018-06-03"),
(1, 1, 1, "2018-06-04"),
(2, 1, 3, "2018-06-04"),
(3, 1, 2, "2018-06-05"),
(4, 1, 1, "2018-06-06"),
(5, 2, 3, "2018-06-01"),
(6, 2, 1, "2018-06-01"),
(7, 3, 1, "2018-06-02"),
(8, 3, 1, "2018-06-02"),
(9, 3, 1, "2018-06-05")
])\
.toDF("order_id", "customer_id", "order_status", "created_at")
test.show()
Each order has its own status: 1 means newly created but not finished, and 3 means it's paid and finished.
Now, I want to do analysis for order comes from
new customers (who has not made purchase before)
old customers (who has finished purchase before)
so I want to add a feature to the above the data, turn into like this
The logic is: for every customer, every order created up to and including the first order with status 3 is counted as coming from a new customer, and every order after that is counted as coming from an old customer.
Or, to put it another way, select the data before the first occurrence of value 3 (for each customer's orders, sorted by date ascending).
How can I do this, in SQL?
I searched around but didn't find good solution. If in Python, I think maybe I'll simply do some loop to get the values.
This is tested for SQLite:
-- Labels each order 'old' when its order_id is greater than the customer's first status-3 order_id,
-- otherwise 'new' (including customers that have no status-3 order at all).
SELECT
    o.order_id,
    o.customer_id,
    o.order_status,
    o.created_at,
    CASE
        WHEN o.order_id > fp.first_paid_order_id THEN 'old'
        ELSE 'new'
    END AS customer_status
FROM orders AS o
LEFT JOIN (
    -- Lowest order_id with status 3 per customer.
    SELECT customer_id, MIN(order_id) AS first_paid_order_id
    FROM orders
    WHERE order_status = 3
    GROUP BY customer_id
) AS fp
    ON fp.customer_id = o.customer_id
You can do this using window functions in Spark:
-- Labels an order 'old' when it was created after the customer's earliest status-3 order, else 'new'.
-- The status column of the test data is named order_status.
-- Customers without any status-3 order have a NULL minimum, so all their orders are 'new'.
select t.*,
       (case when t.created_at > min(case when t.order_status = 3 then t.created_at end)
                                     over (partition by t.customer_id)
             then 'old'
             else 'new'
        end) as customer_status
from test t;
Note that this assigns "new" to customers with no order with status "3".
You can also write this using join and group by:
-- Join/group-by form of the same labelling: an order is 'old' when it was created after the
-- customer's earliest status-3 order, otherwise 'new'.
-- The subquery reads the test table itself (an outer alias cannot be used as a table name)
-- and filters on the order_status column.
-- Only later orders match the join, so unmatched rows -- including every order of a customer
-- with no status-3 order -- fall back to 'new'.
select t.*,
       coalesce(t3.customer_status, 'new') as customer_status
from test t
left join (select f.customer_id,
                  min(f.created_at) as min_created_at,
                  'old' as customer_status
           from test f
           where f.order_status = 3
           group by f.customer_id
          ) t3
  on t.customer_id = t3.customer_id
 and t.created_at > t3.min_created_at;

Use recursive CTE to handle date logic

At work, one of my assignments is to calculate commission to the sales staff. One rule has been more challenging than the others.
Two sales teams, A and B, work together, each selling different products. Team A can send leads to team B. The same customer might be sent multiple times. The first time a customer is sent (e.g. lead 1)*, a commission is paid to the salesperson in team A who created the lead. Now the customer is “locked” for the next 365 days (counting from the date lead 1 was created), meaning that no one can get additional commission for that customer in that period by sending additional leads (e.g. leads 2 and 3 get no commission). After the 365 days have expired, a new lead can be created and receive commission (e.g. lead 4). Then the customer is locked again for 365 days, counting from the day lead 4 was created. Therefore, lead 5 gets no commission. The tricky part is to reset the date that the 365 days are counted from.
* Refers to the tables #LEADS and #DISERED_RESULT below.
I have solved the problem in T-SQL using a cursor, but I wonder whether it is possible to use a recursive CTE instead. I have made several attempts; the best one is pasted below. The problem with my solution is that I refer to the recursive table more than once. I have tried to fix that problem by nesting a CTE inside a CTE, but that is not allowed. I have tried using a temporary table inside the CTE; that is not allowed either. I tried several times to recode the recursive part of the CTE so that the recursive table is referenced only once, but then I am not able to get the logic to work.
I am using SQL 2008
-- Recreate the #LEADS temp table: one row per lead with its customer, creation date and salesperson.
IF OBJECT_ID('tempdb.dbo.#LEADS', 'U') IS NOT NULL
DROP TABLE #LEADS;
CREATE TABLE #LEADS (LEAD_ID INT, CUSTOMER_ID INT, LEAD_CREATED_DATE DATETIME, SALESPERSON_NAME varchar(20))
-- Sample leads: five for customer 1 and five for customer 2.
INSERT INTO #LEADS
VALUES (1, 1, '2013-09-01', 'Rasmus')
,(2, 1, '2013-11-01', 'Christian')
,(3, 1, '2014-01-01', 'Nadja')
,(4, 1, '2014-12-24', 'Roar')
,(5, 1, '2015-12-01', 'Kristian')
,(6, 2, '2014-01-05', 'Knud')
,(7, 2, '2015-01-02', 'Rasmus')
,(8, 2, '2015-01-08', 'Roar')
,(9, 2, '2016-02-05', 'Kristian')
,(10, 2, '2016-03-05', 'Casper')
SELECT *
FROM #LEADS;
-- Recreate the #DISERED_RESULT temp table: the expected commission outcome (YES/NO) per lead.
IF OBJECT_ID('tempdb.dbo.#DISERED_RESULT', 'U') IS NOT NULL
DROP TABLE #DISERED_RESULT;
CREATE TABLE #DISERED_RESULT (LEAD_ID INT, DESIRED_COMMISION_RESULT CHAR(3))
INSERT INTO #DISERED_RESULT
VALUES (1, 'YES')
,(2, 'NO')
,(3, 'NO')
,(4, 'YES')
,(5, 'NO')
,(6, 'YES')
,(7, 'NO')
,(8, 'YES')
,(9, 'YES')
,(10, 'NO')
SELECT *
FROM #DISERED_RESULT;
-- Attempted recursive CTE: start from each customer's first lead, then repeatedly add that
-- customer's next unprocessed lead, deciding commission from the latest commission date so far.
WITH COMMISSION_CALCULATION AS
(
-- Anchor: the earliest lead of every customer always earns commission ('YES'),
-- and its creation date becomes the first commission date.
SELECT T1.*
,COMMISSION = 'YES'
,MIN_LEAD_CREATED_DATE AS COMMISSION_DATE
FROM #LEADS AS T1
INNER JOIN (
SELECT A.CUSTOMER_ID
,MIN(A.LEAD_CREATED_DATE) AS MIN_LEAD_CREATED_DATE
FROM #LEADS AS A
GROUP BY A.CUSTOMER_ID
) AS T2 ON T1.CUSTOMER_ID = T2.CUSTOMER_ID AND T1.LEAD_CREATED_DATE = T2.MIN_LEAD_CREATED_DATE
UNION ALL
-- Recursive member: for each customer, take the earliest lead not yet in the CTE (RN = 1)
-- and give it commission only if it is more than 365 days after the latest commission date.
-- NOTE(review): this member references COMMISSION_CALCULATION twice (T4 and the NOT IN subquery)
-- and uses GROUP BY / MAX over it; SQL Server's documented rules for recursive members disallow
-- these, which matches the problem described above.
SELECT T10.LEAD_ID
,T10.CUSTOMER_ID
,T10.LEAD_CREATED_DATE
,T10.SALESPERSON_NAME
,T10.COMMISSION
,T10.COMMISSION_DATE
FROM (SELECT ROW_NUMBER() OVER(PARTITION BY T5.CUSTOMER_ID ORDER BY T5.LEAD_CREATED_DATE ASC) AS RN
,T5.*
,T6.MAX_COMMISSION_DATE
-- Days since the last commission (column name is Danish for "number of days since last commission").
,DATEDIFF(DAY, T6.MAX_COMMISSION_DATE, T5.LEAD_CREATED_DATE) AS ANTAL_DAGE_SIDEN_SIDSTE_COMMISSION
,CASE
WHEN DATEDIFF(DAY, T6.MAX_COMMISSION_DATE, T5.LEAD_CREATED_DATE) > 365 THEN 'YES'
ELSE 'NO'
END AS COMMISSION
-- A lead that earns commission restarts the window; otherwise no commission date is recorded.
,CASE
WHEN DATEDIFF(DAY, T6.MAX_COMMISSION_DATE, T5.LEAD_CREATED_DATE) > 365 THEN T5.LEAD_CREATED_DATE
ELSE NULL
END AS COMMISSION_DATE
FROM #LEADS AS T5
INNER JOIN (SELECT T4.CUSTOMER_ID
,MAX(T4.COMMISSION_DATE) AS MAX_COMMISSION_DATE
FROM COMMISSION_CALCULATION AS T4
GROUP BY T4.CUSTOMER_ID) AS T6 ON T5.CUSTOMER_ID = T6.CUSTOMER_ID
WHERE T5.LEAD_ID NOT IN (SELECT LEAD_ID FROM COMMISSION_CALCULATION)
) AS T10
WHERE RN = 1
)
SELECT *
FROM COMMISSION_CALCULATION;
I have made some assumptions where your description does not fully make sense as written, but the below achieves your desired result:
-- Setup: sample leads (one row per lead) and the expected commission outcome per lead.
-- Every statement is terminated with a semicolon; T-SQL requires the statement before a
-- WITH (common table expression) to be terminated.
if object_id('tempdb.dbo.#leads', 'u') is not null
    drop table #leads;
create table #leads (lead_id int, customer_id int, lead_created_date datetime, salesperson_name varchar(20));
insert into #leads (lead_id, customer_id, lead_created_date, salesperson_name)
values (1, 1, '2013-09-01', 'rasmus')
      ,(2, 1, '2013-11-01', 'christian')
      ,(3, 1, '2014-01-01', 'nadja')
      ,(4, 1, '2014-12-24', 'roar')
      ,(5, 1, '2015-12-01', 'kristian')
      ,(6, 2, '2014-01-05', 'knud')
      ,(7, 2, '2015-01-02', 'rasmus')
      ,(8, 2, '2015-01-08', 'roar')
      ,(9, 2, '2016-02-05', 'kristian')
      ,(10, 2, '2016-03-05', 'casper');

if object_id('tempdb.dbo.#disered_result', 'u') is not null
    drop table #disered_result;
create table #disered_result (lead_id int, desired_commision_result char(3));
insert into #disered_result (lead_id, desired_commision_result)
values (1, 'yes'),(2, 'no'),(3, 'no'),(4, 'yes'),(5, 'no'),(6, 'yes'),(7, 'no'),(8, 'yes'),(9, 'yes'),(10, 'no');

-- rownum: numbers each customer's leads 1..n by creation date. lead_id breaks ties so the
-- order is deterministic when two leads share a timestamp.
with rownum
as
(
    select row_number() over (partition by customer_id order by lead_created_date, lead_id) as rn
          ,lead_id
          ,customer_id
          ,lead_created_date
          ,salesperson_name
    from #leads
)
-- cte: walks each customer's leads in order, carrying the start of the current commission window.
,cte
as
(
    -- Anchor: every customer's first lead earns commission and opens the first window.
    select rn
          ,lead_id
          ,customer_id
          ,lead_created_date
          ,salesperson_name
          ,'yes' as commission_result
          ,lead_created_date as commission_window_start
    from rownum
    where rn = 1
    union all
    -- Step: the customer's next lead earns commission only if it falls after the current window,
    -- in which case it opens a new window; otherwise the existing window is carried forward.
    -- The window is 365 days, not one calendar year (leap years do not add a day).
    select r.rn
          ,r.lead_id
          ,r.customer_id
          ,r.lead_created_date
          ,r.salesperson_name
          ,case when r.lead_created_date > dateadd(d, 365, c.commission_window_start)
                then 'yes'
                else 'no'
           end as commission_result
          ,case when r.lead_created_date > dateadd(d, 365, c.commission_window_start)
                then r.lead_created_date
                else c.commission_window_start
           end as commission_window_start
    from rownum r
    inner join cte c
        on r.customer_id = c.customer_id
       and r.rn = c.rn + 1
)
select lead_id
      ,commission_result
from cte
order by customer_id
        ,lead_created_date
-- Recursion depth equals the largest number of leads for one customer; lift the default limit
-- of 100 levels so customers with more leads do not abort the query.
option (maxrecursion 0);