SQL query with row_number() and data cleaning where there is no validation - sql

I have "prices" table with this data:
id   start_date  end_date   paid
787  1/1/2022    1/15/2022  100
787  1/15/2022   1/22/2022  100,000
787  1/22/2022   2/25/2022  115,000
787  2/25/2022   2/28/2022  14,000
787  2/28/2022   3/05/2022  122,000
787  3/05/2022   3/09/2022  120
423  1/25/2022   2/25/2022  1
423  2/25/2022   2/28/2022  7,500,000
423  3/19/2022   3/22/2022  7,200,000
423  3/25/2022   3/29/2022  1,111,111,111
423  4/13/2022   4/26/2022  999,999,999,999
423  4/26/2022   4/28/2022  999,999,999,99
423  4/28/2022   5/09/2022  7,100,000
423  5/09/2022   5/22/2022  666,666,666,666
I want to end up with just one row per id, using the earliest start_date and the latest end_date.
But most importantly: if a paid value is much smaller or bigger than the others, i.e. the price is not within +30%/-30% of the following price, take the following/previous price instead, and that will be the "real" price.
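For example, for id 787 the first paid value, 100, is nowhere near ±30% of the following 100,000, so it is "fake" and 100,000 becomes the first real price; likewise the trailing 120 is discarded in favor of 122,000.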
My expected output:
id   start_date  end_date   first_paid  last_paid
787  1/1/2022    3/09/2022  100,000     122,000
423  1/25/2022   5/22/2022  7,500,000   7,100,000
I created this query, but it works over all the numbers, including the "fake" ones, and I want to clean the "fake" numbers out:
select a.id, b.start_date, a.end_date, b.paid as first_paid, a.paid as last_paid
from (
    select id, start_date, end_date, paid
    from (
        select *,
               row_number() over (partition by id order by end_date desc) rn
        from "prices"
    )
    where rn = 1
) as a
left join (
    select id, start_date, end_date, paid
    from (
        select *,
               row_number() over (partition by id order by start_date) rn
        from "prices"
    )
    where rn = 1
) as b
on a.id = b.id
If someone has a good idea or solution, ideally in Presto syntax, that would be amazing.
Thanks, all.

I do hope I understood your explanation correctly.
See this DBFIDDLE, with some explanations of what is done to get this:
SELECT
    x.id,
    x.start_date,
    x.end_date,
    min(x.first_paid) as first_paid,
    max(x.last_paid) as last_paid
FROM (
    SELECT DISTINCT
        id,
        min(start_date) over (partition by id) as start_date,
        max(end_date) over (partition by id) as end_date,
        first_value(paid) over (partition by id order by start_date) as first_paid,
        last_value(paid) over (partition by id order by end_date) as last_paid
    FROM (
        SELECT
            id,
            start_date,
            end_date,
            paid as old_paid,
            lag(paid) over (partition by id order by start_date) as previous_paid,
            CASE WHEN abs(((lag(paid) over (partition by id order by start_date)) - paid)
                     / (lag(paid) over (partition by id order by start_date))) > 30
                 THEN lag(paid) over (partition by id order by start_date)
                 ELSE paid
            END as paid
        FROM mytable2 m2
    ) m1
) x
GROUP BY x.id, x.start_date, x.end_date
;
output:
id   start_date  end_date    first_paid  last_paid
423  2022-01-25  2022-05-22  1.00        99999999999.00
787  2022-01-01  2022-03-09  100.00      122000.00
I now see that I must have made a typo somewhere, because 100 seems not to be correct...
But I will leave it to you to find out where I went wrong...
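My guess at the typo, for what it's worth: 30% expressed as a ratio is 0.3, so the CASE comparison should probably be against 0.3 rather than 30. A minimal sketch of the corrected expression (note this alone may not fix the 100 in the output, since the first row per id has no previous value to compare against):
-- assumption: the relative change should be compared against 0.3 (i.e. 30%), not 30
CASE WHEN abs(((lag(paid) over (partition by id order by start_date)) - paid)
         / (lag(paid) over (partition by id order by start_date))) > 0.3
     THEN lag(paid) over (partition by id order by start_date)
     ELSE paid
END as paid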
EDIT: You will have to find a smart way to filter the low and high values that you do not want to include.
Something like:
select * from (
    SELECT
        id,
        paid,
        avg(paid) over (partition by id) as avg1,
        abs((COALESCE(lead(paid) over (partition by id order by paid), min(paid) over (partition by id)) +
             COALESCE(lag(paid)  over (partition by id order by paid), max(paid) over (partition by id)) +
             paid) / 3 - paid) as avg2
    FROM mytable2 m2
) x
where x.avg2 < avg1
order by x.id, x.paid
output:
id   paid             avg1                 avg2
423  7100000.00       220974947222.000000  2333333.000000
423  7200000.00       220974947222.000000  66666.666666
423  7500000.00       220974947222.000000  367770370.333333
423  1111111111.00    220974947222.000000  32595092592.333333
423  99999999999.00   220974947222.000000  155925925926.333333
423  666666666666.00  220974947222.000000  77777777778.000000
787  100.00           58536.666666         40640.000000
787  120.00           58536.666666         4620.000000
787  14000.00         58536.666666         24040.000000
787  100000.00        58536.666666         23666.666667
787  115000.00        58536.666666         2666.666667
787  122000.00        58536.666666         42966.666667
This filters out the 1 and the 999,999,999,999 ....
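For a Presto-flavored take (the OP asked for Presto syntax), here is a rough, untested sketch. It swaps in a simpler heuristic than the row-by-row ±30% rule: it keeps only rows within ±30% of the per-id median paid, then aggregates. It assumes a table prices(id, start_date, end_date, paid) with proper DATE and numeric columns:
WITH med AS (
    -- approximate median paid per id
    SELECT id, approx_percentile(paid, 0.5) AS med_paid
    FROM prices
    GROUP BY id
),
clean AS (
    -- keep only rows within +/-30% of the per-id median
    SELECT p.*
    FROM prices p
    JOIN med m ON p.id = m.id
    WHERE p.paid BETWEEN 0.7 * m.med_paid AND 1.3 * m.med_paid
)
SELECT d.id,
       d.start_date,
       d.end_date,
       c.first_paid,
       c.last_paid
FROM (
    -- the overall date range comes from all rows, as in the expected output
    SELECT id, min(start_date) AS start_date, max(end_date) AS end_date
    FROM prices
    GROUP BY id
) d
JOIN (
    -- the first/last "real" price comes from the cleaned rows only
    SELECT id,
           min_by(paid, start_date) AS first_paid,
           max_by(paid, end_date)   AS last_paid
    FROM clean
    GROUP BY id
) c ON d.id = c.id
Whether a ±30% band around the median keeps the intended rows depends on the data; for heavily skewed series the OP's neighbor-comparison rule (or another robust center) may work better.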

Related

SQL query to generate summary file base on change in price per item

I need help writing a query to generate a summary file of quantity purchased per item and per cost from a purchase history file. To run the query, the ORDER BY would be ITEM_NO, PO_DATE, and COST.
SAMPLE DATA - PURCHASE HISTORY
OUTPUT FILE - SUMMARY
We can group by item_no and cost, plus an island marker, and get all the info we need: the chng flag marks each row where the cost differs from the previous row for that item, and the running count of those flags assigns a group number, so the same cost appearing in two separate periods is not merged.
select item_no
      ,cost
      ,min(po_date) as start_date
      ,max(po_date) as end_date
      ,sum(qty) as qty
from (
    select *
          ,count(chng) over (partition by item_no order by po_date) as grp
    from (
        select *
              ,case when lag(cost) over (partition by item_no order by po_date) <> cost then 1 end as chng
        from t
    ) t
) t
group by item_no, cost, grp
order by item_no, start_date
item_no  cost  start_date           end_date             qty
12345    1.25  2021-01-02 00:00:00  2021-01-04 00:00:00  150
12345    2.00  2021-02-01 00:00:00  2021-02-03 00:00:00  60
78945    5.25  2021-06-10 00:00:00  2021-06-12 00:00:00  90
78945    4.50  2021-10-18 00:00:00  2021-10-19 00:00:00  150
Fiddle
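Since the sample-data image did not survive, here is a hypothetical setup consistent with the summary above (column names assumed from the query; the per-row quantities are made up so that the sums match):
-- hypothetical sample data for table t
create table t (item_no int, po_date timestamp, cost decimal(10,2), qty int);
insert into t (item_no, po_date, cost, qty) values
(12345, timestamp '2021-01-02 00:00:00', 1.25, 50),
(12345, timestamp '2021-01-03 00:00:00', 1.25, 50),
(12345, timestamp '2021-01-04 00:00:00', 1.25, 50),
(12345, timestamp '2021-02-01 00:00:00', 2.00, 30),
(12345, timestamp '2021-02-03 00:00:00', 2.00, 30),
(78945, timestamp '2021-06-10 00:00:00', 5.25, 45),
(78945, timestamp '2021-06-12 00:00:00', 5.25, 45),
(78945, timestamp '2021-10-18 00:00:00', 4.50, 75),
(78945, timestamp '2021-10-19 00:00:00', 4.50, 75);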

How to do a Min and Max of date but following the changes in price points

I'm not really sure how to word this question better so I'll provide the data that I have and the result that I'm after.
This is the data that I have
sku sales qty date
A 100 1 1-Jan-19
A 200 2 2-Jan-19
A 100 1 3-Jan-19
A 240 2 4-Jan-19
A 360 3 5-Jan-19
A 360 4 6-Jan-19
A 200 2 7-Jan-19
A 90 1 8-Jan-19
B 100 1 9-Jan-19
B 200 2 10-Jan-19
And this is the result that I'm after
sku price sum(qty) sum(sales) min(date) max(date)
A 100 4 400 1-Jan-19 3-Jan-19
A 120 5 600 4-Jan-19 5-Jan-19
A 90 4 360 6-Jan-19 6-Jan-19
A 100 2 200 7-Jan-19 7-Jan-19
A 90 1 90 8-Jan-19 8-Jan-19
B 100 3 300 9-Jan-19 10-Jan-19
As you can see, I'm trying to get the min and max date of each price point, where price = sales/qty. At this point, I can get the min and max date of the same price, but I can't separate them when there's another price in between. I think I have to use some sort of min(date) over (partition by sales/qty order by date), but I can't figure it out yet.
I'm using Redshift SQL
This is a gaps-and-islands query. You can do this by generating a sequence and subtracting that from the date. Then aggregate:
select sku, price, sum(qty), sum(sales),
       min(date), max(date)
from (select t.*,
             row_number() over (partition by sku, price order by date) as seqnum
      from (select t.*, sales / qty as price from t) t
     ) t
group by sku, price, (date - seqnum * interval '1 day')
order by sku, price, min(date);
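To see why the subtraction works, take sku A at price 100 (sales/qty): it appears on Jan 1, 2, and 3, and again on Jan 7. Within the (sku, price) partition those rows get seqnum 1, 2, 3, 4, so date - seqnum days gives Dec 31, Dec 31, Dec 31, and Jan 3: the first three rows share one group key, while the Jan 7 row lands in its own island.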
You can do this with a subquery and LAG:
FIDDLE DEMO
SELECT SKU, Price, SUM(Qty) SumQty, SUM(Sales) SumSales, MIN(date) MinDate, MAX(date) MaxDate
FROM (
    SELECT SKU, Price, SUM(is_change) OVER (ORDER BY SKU, date) is_change, Sales, Qty, date
    FROM (
        SELECT SKU, Sales/Qty AS Price, Sales, Qty, date,
               CASE WHEN Sales/Qty = LAG(Sales/Qty) OVER (ORDER BY SKU, date)
                     AND SKU = LAG(SKU) OVER (ORDER BY SKU, date) THEN 0 ELSE 1 END AS is_change
        FROM Tbl
    ) InnerSelect
) X
GROUP BY SKU, Price, is_change
ORDER BY SKU, MIN(date)

Find the start and end date of stock difference

Please suggest a good SQL query to find the start and end date of stock differences.
Imagine I have data in a table like below.
Sample_table
transaction_date stock
2018-12-01 10
2018-12-02 10
2018-12-03 20
2018-12-04 20
2018-12-05 20
2018-12-06 20
2018-12-07 20
2018-12-08 10
2018-12-09 10
2018-12-10 30
Expected result should be
Start_date end_date stock
2018-12-01 2018-12-02 10
2018-12-03 2018-12-07 20
2018-12-08 2018-12-09 10
2018-12-10 null 30
This is the gaps-and-islands problem. You may use row_number() and GROUP BY for this.
select t.stock, min(transaction_date), max(transaction_date)
from (
select row_number() over (order by transaction_date) -
row_number() over (partition by stock order by transaction_date) grp,
transaction_date,
stock
from data
) t
group by t.grp, t.stock
In the following DBFIDDLE DEMO I also solve the null value for the last group, but the main idea of finding consecutive rows is built on the above query.
You may check this for an explanation of this solution.
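To illustrate the grp computation on the sample data: the overall row_number() runs 1..10 by date, while the per-stock row_number() restarts for each stock value. For the stock 10 rows on 2018-12-01/02 the difference is 1-1=0 and 2-2=0; for the stock 20 run it is 3-1=2 through 7-5=2; for the stock 10 rows on 2018-12-08/09 it is 8-3=5 and 9-4=5; and for 2018-12-10 (stock 30) it is 10-1=9. Each consecutive run gets its own (grp, stock) pair, so the two stock 10 periods stay separate.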
You can try the below using row_number():
select stock,min(transaction_date) as start_date,
case when min(transaction_date)=max(transaction_date) then null else max(transaction_date) end as end_date
from
(
select *,row_number() over(order by transaction_date)-
row_number() over(partition by stock order by transaction_date) as rn
from t1
)A group by stock,rn
Try to use GROUP BY with MIN and MAX:
SELECT
stock,
MIN(transaction_date) Start_date,
CASE WHEN COUNT(*)>1 THEN MAX(transaction_date) END end_date
FROM Sample_table
GROUP BY stock
ORDER BY stock
You can try it with the LEAD and LAG functions as below:
select currentStockDate as startDate,
       lead(currentStockDate, 1) over (order by currentStockDate) as endDate,
       currentStock
from (
    select *
    from (
        select lag(transaction_date, 1) over (order by transaction_date) as prevStockDate,
               transaction_date as currentStockDate,
               lag(stock, 1) over (order by transaction_date) as prevStock,
               stock as currentStock
        from sample_table
    ) as t
    where (prevStock <> currentStock) or (prevStock is null)
) as t2

Insert the table data based on grouping of two columns

I have an Oracle table with the following format, e.g.:
JLID Dcode SID TDT QTY
8295783 3119255 9842 3/5/2018 14
8269771 3119255 9842 3/6/2018 11
8302211 3119255 1126 3/1/2018 19
Here I have different SIDs for the same Dcode; now I need to get the SID with the maximum qty, i.e. for SID 9842 it is (14+11)=25 and for SID 1126 it is 19, so the result should be for SID 9842. So our query should return the following results:
JLID Dcode START_DT END_DT SID
111 3119255 3/1/2018 3/31/2018 12:00 9842
start_date and end_date should be calculated from TDT, i.e. the start date is the first day of the month and the end date is the last day of the month.
Can anyone please suggest some ideas on how to do this?
It might be as simple as this:
SELECT Dcode, start_date, end_date, SID FROM (
    SELECT Dcode, SID, TRUNC(start_date, 'MONTH') AS start_date
         , LAST_DAY(end_date) AS end_date
         , ROW_NUMBER() OVER ( PARTITION BY Dcode ORDER BY total_qty DESC ) AS rn
    FROM (
        SELECT Dcode, SID, MIN(TDT) AS start_date, MAX(TDT) AS end_date
             , SUM(QTY) AS total_qty
        FROM mytable
        GROUP BY Dcode, SID
    )
) WHERE rn = 1
In the innermost subquery I aggregate to get the range of dates and the total quantity for particular values of Dcode and SID. Then I use an analytic (window) function to get the row for which the total quantity is the greatest. (You would want to use RANK() in place of ROW_NUMBER() in the event that you want to return more than one value of SID with the same quantity.)
Here's one option which doesn't contain JLID = 111 in the final result as I have no idea where you took it from.
SQL> with test (jlid, dcode, sid, tdt, qty) as
2 (select 8295783, 3119255, 9842, date '2018-03-05', 14 from dual union
3 select 8269771, 3119255, 9842, date '2018-08-22', 11 from dual union
4 select 8302211, 3119255, 1126, date '2018-03-01', 19 from dual union
5 --
6 select 1234567, 1112223, 1000, date '2018-06-16', 88 from dual
7 )
8 select dcode,
9 min (trunc (tdt, 'mm')) start_dt, --> MIN
10 max (last_day (tdt)) end_dt, --> MAX
11 sid
12 from (select dcode,
13 sid,
14 tdt,
15 sqty,
16 rank () over (partition by dcode order by sqty desc) rnk
17 from (select dcode,
18 sid,
19 tdt,
20 sum (qty) over (partition by dcode, sid) sqty
21 from test))
22 where rnk = 1
23 group by dcode, sid; --> GROUP BY
DCODE START_DT END_DT SID
---------- ---------------- ---------------- ----------
1112223 01.06.2018 00:00 30.06.2018 00:00 1000
3119255 01.03.2018 00:00 31.08.2018 00:00 9842
SQL>

SQL: Get date when cumulative sum reaches a mark

I have a table in the following format:
APP_ID  Date        Impressions
113     2015-01-01  10
113     2015-01-02  5
113     2015-01-03  50
113     2015-01-04  35
113     2015-01-05  30
113     2015-01-06  75
Now, I need to know the date when cumulative SUM of those impressions crossed 65/100/150 and so on.
I tried using a CASE WHEN statement:
CASE WHEN SUM(impressions) > 100
THEN date
but it doesn't sum the data across rows; it just checks against the individual row.
My final result should look like:
APP_ID  Date_65     Date_100    Date_150
113     2015-01-03  2015-01-04  2015-01-06
Does anyone know how to do this?
Is this even possible?
Use sum() over() to get the running sum and check for the required values with a case expression. Finally, aggregate the results to get one row per app_id; min() picks the first date in each band, i.e. the date the mark was crossed.
select app_id, min(dt_65), min(dt_100), min(dt_150)
from (
    select app_id
          ,case when sum(impressions) over(partition by app_id order by dt) between 65 and 99 then dt end dt_65
          ,case when sum(impressions) over(partition by app_id order by dt) between 100 and 149 then dt end dt_100
          ,case when sum(impressions) over(partition by app_id order by dt) >= 150 then dt end dt_150
    from t) x
group by app_id
with c as (
select
app_id, date,
sum(impressions) over (partition by app_id order by date) as c
from t
)
select app_id, s65.date, s100.date, s150.date
from
(
select distinct on (app_id) app_id, date
from c
where c >= 65 and c < 100
order by app_id, date
) s65
left join
(
select distinct on (app_id) app_id, date
from c
where c >= 100 and c <150
order by app_id, date
) s100 using (app_id)
left join
(
select distinct on (app_id) app_id, date
from c
where c >= 150
order by app_id, date
) s150 using (app_id)
;
app_id | date | date | date
--------+------------+------------+------------
113 | 2015-01-03 | 2015-01-04 | 2015-01-06
Without the pivot:
select distinct on (app_id, break) app_id, break, date
from (
select *,
case
when c < 100 then 65
when c < 150 then 100
else 150
end as break
from (
select
app_id, date,
sum(impressions) over (partition by app_id order by date) as c
from t
) t
where c >= 65
) t
order by app_id, break, date
;
app_id | break | date
--------+-------+------------
113 | 65 | 2015-01-03
113 | 100 | 2015-01-04
113 | 150 | 2015-01-06
You can try this for the desired result.
with t as (select app_id, date, sum(Impressions)
over (partition by app_id order by date) AS s from tbl)
select app_id,
min(date_65) AS date_65 ,
min(date_100) AS date_100,
min(date_150) AS date_150
-- more columns to observe other sum of Impressions
from
(select app_id,
CASE WHEN (s >= 65 and s < 100) THEN date END AS date_65,
CASE WHEN (s >= 100 and s < 150) THEN date END AS date_100,
CASE WHEN (s >= 150 ) THEN date END AS date_150
-- more cases to observe other sum of Impressions
from t) q
group by q.app_id
If you want to observe more impression marks, just add more conditions.
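If the list of marks keeps growing, a derived table of thresholds avoids hand-writing a CASE branch per mark. A sketch along the same lines (unpivoted output, one row per app_id and mark; table and column names as above):
with t as (
    -- running sum of impressions per app
    select app_id, date,
           sum(impressions) over (partition by app_id order by date) as s
    from tbl
)
select t.app_id, th.mark, min(t.date) as first_date_reached
from t
join (select 65 as mark union all select 100 union all select 150) th
  on t.s >= th.mark
group by t.app_id, th.mark
order by t.app_id, th.mark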