SQL - find row with closest date but different column value - sql

i'm new to SQL and i would need an help.
I have a TAB and I need to find for any item B in the TAB the item A with the closest date. In this case the A with 02.09.2021 04:25:30
Date.
Item
07.09.2021 05:02:05
A
06.09.2021 05:01:02
A
05.09.2021 05:00:02
A
04.09.2021 04:59:01
A
03.09.2021 04:58:03
A
02.09.2021 04:56:55
A
02.09.2021 04:33:56
B
02.09.2021 04:25:30
A

WITH CTE(DATE,ITEM)AS
(
SELECT '20210907 05:02:05' , 'A'UNION ALL
SELECT '20210906 05:01:02' , 'A'UNION ALL
SELECT '20210905 05:00:02' , 'A'UNION ALL
SELECT'20210904 04:59:01' , 'A'UNION ALL
SELECT'20210903 04:58:03' , 'A'UNION ALL
SELECT'20210902 04:56:55' , 'A'UNION ALL
SELECT'20210902 04:33:56' , 'B'UNION ALL
SELECT'20210902 04:25:30' , 'A'
)
SELECT
CAST(C.DATE AS DATETIME)X_DATE,C.ITEM,Q.CLOSEST
FROM CTE AS C
OUTER APPLY
(
SELECT TOP 1 CAST(X.DATE AS DATETIME)CLOSEST
FROM CTE AS X
WHERE X.ITEM='A'AND CAST(X.DATE AS DATETIME)<CAST(C.DATE AS DATETIME)
ORDER BY CAST(X.DATE AS DATETIME) ASC
)Q
WHERE C.ITEM='B'
You can use OUTER APPLY-approach as in the above query.
Please also take a look that datetime-column (DATE)is written in the ISO-compliant form

Your data has only two columns. If you want the only the closest A timestamp, then the fastest way is probably window functions:
select t.*,
(case when prev_a_date is null then next_a_date
when next_a_date is null then prev_a_date
when datediff(second, prev_a_date, date) <= datediff(second, date, next_a_date) then prev_a_date
else next_a_date
end) as a_date
from (select t.*,
max(case when item = 'A' then date end) over (order by date) as prev_a_date,
min(case when item = 'A' then date end) over (order by date desc) as next_a_date
from t
) t
where item = 'B';
This uses seconds to measure the time difference, but you can use a smaller unit if appropriate.
You can also do this using apply if you have more columns from the "A" rows that you want:
select tb.*, ta.*
from t b outer apply
(select top (1) ta.*
from t ta
where item = 'A'
order by abs(datediff(second, a.date, b.date))
) t
where item = 'B';

Related

Identify date range and merge into max and min dates

I have data ( int, date , date types)
SELECT * FROM
(
VALUES
(1700171048,'2020-12-21','2021-01-03'),
(1700171048,'2021-01-05','2021-01-12'),
(1700171048,'2021-01-13','2021-01-17'),
(1700171048,'2021-01-18','2021-01-19'),
(1700171048,'2021-01-22','2021-01-27'),
(1700171048,'2021-01-28','2021-02-17')
(1700171049,'2020-12-21','2021-01-03'),
(1700171049,'2021-01-04','2021-01-05'),
(1700171049,'2021-01-06','2021-01-17'),
(1700171049,'2021-01-18','2021-01-19'),
(1700171049,'2021-01-20','2021-01-27'),
(1700171049,'2021-01-28','2021-02-17')
) AS c (id1, st, endt )
I need output( i.e. if start and end dates are continuous then make it part of group )
id1 st endt
1700171048 '2020-12-21' , '2021-01-03'
1700171048 '2021-01-05' , '2021-01-19'
1700171048 '2021-01-22' , '2021-02-17'
1700171049 '2020-12-21' to '2021-02-17'
I tried this, won't work.
select id, case when min(b.st) = max(b.endt) + 1 then min(b.st) end,
case when min(b.endt) = min(b.st) + 1 then max(b.st) end
from c a join c b
group by id
This is a type of gaps-and-islands problem. Use lag() to identify if there is an overlap. Then a cumulative sum of when there is no overlaps and aggregation:
select id1, min(st), max(endt)
from (select t.*,
sum(case when prev_endt >= st + interval '-1 day' then 0 else 1 end) over (partition by id1 order by st) as grp
from (select t.*,
lag(endt) over (partition by id1 order by st) as prev_endt
from t
) t
) t
group by id1, grp;
Here is a db<>fiddle.

conditional running sum

I'm trying to return the number of unique users that converted over time.
So I have the following query:
WITH CTE
As
(
SELECT '2020-04-01' as date,'userA' as user,1 as goals Union all
SELECT '2020-04-01','userB',0 Union all
SELECT '2020-04-01','userC',0 Union all
SELECT '2020-04-03','userA',1 Union all
SELECT '2020-04-05','userC',1 Union all
SELECT '2020-04-06','userC',0 Union all
SELECT '2020-04-06','userB',0
)
select
date,
COUNT(DISTINCT
IF
(goals >= 1,
user,
NULL)) AS cad_converters
from CTE
group by date
I'm trying to count distinct user but I need to find a way to apply the distinct count to the whole date. I probably need to do something like a cumulative some...
expected result would be something like this
date, goals, total_unique_converted_users
'2020-04-01',1,1
'2020-04-01',0,1
'2020-04-01',0,1
'2020-04-03',1,2
'2020-04-05',1,2
'2020-04-06',0,2
'2020-04-06',0,2
Below is for BigQuery Standard SQL
#standardSQL
SELECT t.date, t.goals, total_unique_converted_users
FROM `project.dataset.table` t
LEFT JOIN (
SELECT a.date,
COUNT(DISTINCT IF(b.goals >= 1, b.user, NULL)) AS total_unique_converted_users
FROM `project.dataset.table` a
CROSS JOIN `project.dataset.table` b
WHERE a.date >= b.date
GROUP BY a.date
)
USING(date)
I would approach this by tagging when the first goal is scored for each name. Then simply do a cumulative sum:
select cte.* except (seqnum), countif(seqnum = 1) over (order by date)
from (select cte.*,
(case when goals = 1 then row_number() over (partition by user, goals order by date) end) as seqnum
from cte
) cte;
I realize this can be expressed without the case in the subquery:
select cte.* except (seqnum), countif(seqnum = 1 and goals = 1) over (order by date)
from (select cte.*,
row_number() over (partition by user, goals order by date) as seqnum
from cte
) cte;

How to get the validity date range of a price from individual daily prices in SQL

I have some prices for the month of January.
Date,Price
1,100
2,100
3,115
4,120
5,120
6,100
7,100
8,120
9,120
10,120
Now, the o/p I need is a non-overlapping date range for each price.
price,from,To
100,1,2
115,3,3
120,4,5
100,6,7
120,8,10
I need to do this using SQL only.
For now, if I simply group by and take min and max dates, I get the below, which is an overlapping range:
price,from,to
100,1,7
115,3,3
120,4,10
This is a gaps-and-islands problem. The simplest solution is the difference of row numbers:
select price, min(date), max(date)
from (select t.*,
row_number() over (order by date) as seqnum,
row_number() over (partition by price, order by date) as seqnum2
from t
) t
group by price, (seqnum - seqnum2)
order by min(date);
Why this works is a little hard to explain. But if you look at the results of the subquery, you will see how the adjacent rows are identified by the difference in the two values.
SELECT Lag.price,Lag.[date] AS [From], MIN(Lead.[date]-Lag.[date])+Lag.[date] AS [to]
FROM
(
SELECT [date],[Price]
FROM
(
SELECT [date],[Price],LAG(Price) OVER (ORDER BY DATE,Price) AS LagID FROM #table1 A
)B
WHERE CASE WHEN Price <> ISNULL(LagID,1) THEN 1 ELSE 0 END = 1
)Lag
JOIN
(
SELECT [date],[Price]
FROM
(
SELECT [date],Price,LEAD(Price) OVER (ORDER BY DATE,Price) AS LeadID FROM [#table1] A
)B
WHERE CASE WHEN Price <> ISNULL(LeadID,1) THEN 1 ELSE 0 END = 1
)Lead
ON Lag.[Price] = Lead.[Price]
WHERE Lead.[date]-Lag.[date] >= 0
GROUP BY Lag.[date],Lag.[price]
ORDER BY Lag.[date]
Another method using ROWS UNBOUNDED PRECEDING
SELECT price, MIN([date]) AS [from], [end_date] AS [To]
FROM
(
SELECT *, MIN([abc]) OVER (ORDER BY DATE DESC ROWS UNBOUNDED PRECEDING ) end_date
FROM
(
SELECT *, CASE WHEN price = next_price THEN NULL ELSE DATE END AS abc
FROM
(
SELECT a.* , b.[date] AS next_date, b.price AS next_price
FROM #table1 a
LEFT JOIN #table1 b
ON a.[date] = b.[date]-1
)AA
)BB
)CC
GROUP BY price, end_date

Group by in columns and rows, counts and percentages per day

I have a table that has data like following.
attr |time
----------------|--------------------------
abc |2018-08-06 10:17:25.282546
def |2018-08-06 10:17:25.325676
pqr |2018-08-05 10:17:25.366823
abc |2018-08-06 10:17:25.407941
def |2018-08-05 10:17:25.449249
I want to group them and count by attr column row wise and also create additional columns in to show their counts per day and percentages as shown below.
attr |day1_count| day1_%| day2_count| day2_%
----------------|----------|-------|-----------|-------
abc |2 |66.6% | 0 | 0.0%
def |1 |33.3% | 1 | 50.0%
pqr |0 |0.0% | 1 | 50.0%
I'm able to display one count by using group by but unable to find out how to even seperate them to multiple columns. I tried to generate day1 percentage with
SELECT attr, count(attr), count(attr) / sum(sub.day1_count) * 100 as percentage from (
SELECT attr, count(*) as day1_count FROM my_table WHERE DATEPART(week, time) = DATEPART(day, GETDate()) GROUP BY attr) as sub
GROUP BY attr;
But this also is not giving me correct answer, I'm getting all zeroes for percentage and count as 1. Any help is appreciated. I'm trying to do this in Redshift which follows postgresql syntax.
Let's nail the logic before presenting:
with CTE1 as
(
select attr, DATEPART(day, time) as theday, count(*) as thecount
from MyTable
)
, CTE2 as
(
select theday, sum(thecount) as daytotal
from CTE1
group by theday
)
select t1.attr, t1.theday, t1.thecount, t1.thecount/t2.daytotal as percentofday
from CTE1 t1
inner join CTE2 t2
on t1.theday = t2.theday
From here you can pivot to create a day by day if you feel the need
I am trying to enhance the query #johnHC btw if you needs for 7days then you have to those days in case when
with CTE1 as
(
select attr, time::date as theday, count(*) as thecount
from t group by attr,time::date
)
, CTE2 as
(
select theday, sum(thecount) as daytotal
from CTE1
group by theday
)
,
CTE3 as
(
select t1.attr, EXTRACT(DOW FROM t1.theday) as day_nmbr,t1.theday, t1.thecount, t1.thecount/t2.daytotal as percentofday
from CTE1 t1
inner join CTE2 t2
on t1.theday = t2.theday
)
select CTE3.attr,
max(case when day_nmbr=0 then CTE3.thecount end) as day1Cnt,
max(case when day_nmbr=0 then percentofday end) as day1,
max(case when day_nmbr=1 then CTE3.thecount end) as day2Cnt,
max( case when day_nmbr=1 then percentofday end) day2
from CTE3 group by CTE3.attr
http://sqlfiddle.com/#!17/54ace/20
In case that you have only 2 days:
http://sqlfiddle.com/#!17/3bdad/3 (days descending as in your example from left to right)
http://sqlfiddle.com/#!17/3bdad/5 (days ascending)
The main idea is already mentioned in the other answers. Instead of joining the CTEs for calculating the values I am using window functions which is a bit shorter and more readable I think. The pivot is done the same way.
SELECT
attr,
COALESCE(max(count) FILTER (WHERE day_number = 0), 0) as day1_count, -- D
COALESCE(max(percent) FILTER (WHERE day_number = 0), 0) as day1_percent,
COALESCE(max(count) FILTER (WHERE day_number = 1), 0) as day2_count,
COALESCE(max(percent) FILTER (WHERE day_number = 1), 0) as day2_percent
/*
Add more days here
*/
FROM(
SELECT *, (count::float/count_per_day)::decimal(5, 2) as percent -- C
FROM (
SELECT DISTINCT
attr,
MAX(time::date) OVER () - time::date as day_number, -- B
count(*) OVER (partition by time::date, attr) as count, -- A
count(*) OVER (partition by time::date) as count_per_day
FROM test_table
)s
)s
GROUP BY attr
ORDER BY attr
A counting the rows per day and counting the rows per day AND attr
B for more readability I convert the date into numbers. Here I take the difference between current date of the row and the maximum date available in the table. So I get a counter from 0 (first day) up to n - 1 (last day)
C calculating the percentage and rounding
D pivot by filter the day numbers. The COALESCE avoids the NULL values and switched them into 0. To add more days you can multiply these columns.
Edit: Made the day counter more flexible for more days; new SQL Fiddle
Basically, I see this as conditional aggregation. But you need to get an enumerator for the date for the pivoting. So:
SELECT attr,
COUNT(*) FILTER (WHERE day_number = 1) as day1_count,
COUNT(*) FILTER (WHERE day_number = 1) / cnt as day1_percent,
COUNT(*) FILTER (WHERE day_number = 2) as day2_count,
COUNT(*) FILTER (WHERE day_number = 2) / cnt as day2_percent
FROM (SELECT attr,
DENSE_RANK() OVER (ORDER BY time::date DESC) as day_number,
1.0 * COUNT(*) OVER (PARTITION BY attr) as cnt
FROM test_table
) s
GROUP BY attr, cnt
ORDER BY attr;
Here is a SQL Fiddle.

Oracle SQL: Show entries from component tables once apiece

My objective is produce a dataset that shows a boatload of data from, in total, just shy of 50 tables, all in the same Oracle SQL database schema. Each table except the first consists of, as far as the report I'm building cares, two elements:
A foreign-key identifier that matches a row on the first table
A date
There may be many rows on one of these tables corresponding to one case, and it will NOT be the same number of rows from table to table.
My objective is to have each row in the first table show up as many times as needed to display all the results from the other tables once. So, something like this (except on a lot more tables):
CASE_FILE_ID INITIATED_DATE INSPECTION_DATE PAYMENT_DATE ACTION_DATE
------------ -------------- --------------- ------------ -----------
1000 10-JUL-1986 14-JUL-1987 10-JUL-1986
1000 14-JUL-1988 10-JUL-1987
1000 14-JUL-1989 10-JUL-1988
1000 10-JUL-1989
My current SQL code (shrunk down to five tables, but the rest all follow the same format as T1-T4):
SELECT DISTINCT
A.CASE_FILE_ID,
T1.DATE AS INITIATED_DATE,
T2.DATE AS INSPECTION_DATE,
T3.DATE AS PAYMENT_DATE,
T4.DATE AS ACTION_DATE
FROM
RECORDS.CASE_FILE A
LEFT OUTER JOIN RECORDS.INITIATE T1 ON A.CASE_FILE_ID = T1.CASE_FILE_ID
LEFT OUTER JOIN RECORDS.INSPECTION T2 ON A.CASE_FILE_ID = T2.CASE_FILE_ID
LEFT OUTER JOIN RECORDS.PAYMENT T3 ON A.CASE_FILE_ID = T3.CASE_FILE_ID
LEFT OUTER JOIN RECORDS.ACTION T4 ON A.CASE_FILE_ID = T4.CASE_FILE_ID
ORDER BY
A.CASE_FILE_ID
The problem is, the output this produces results in distinct combinations; so in the above example (where I added a 'WHERE' clause of A.CASE_FILE_ID = '1000'), instead of four rows for case 1000, it'd show twelve (1 Initiated Date * 3 Inspection Dates * 4 Payment Dates = 12 rows). Suffice it to say, as the number of tables increases, this would get very prohibitive in both display and runtime, very quickly.
What is the best way to get an output loosely akin to the ideal above, where any one date is only shown once? Failing that, is there a way to get it to only show as many lines for one CASE_FILE as it needs to show all the dates, even if some dates repeat within that?
There isn't a good way, but there are two ways. One method involves subqueries for each table and complex outer joins. The second involves subqueries and union all. Let's go with that one:
SELECT CASE_FILE_ID,
MAX(INITIATED_DATE) as INITIATED_DATE,
MAX(INSPECTION_DATE) as INSPECTION_DATE,
MAX(PAYMENT_DATE) as PAYMENT_DATE,
MAX(ACTION) as ACTION
FROM ((SELECT A.CASE_FILE_ID, NULL as INITIATED_DATE, NULL as INSPECTION_DATE,
NULL as PAYMENT_DATE, NULL as ACTION_DATE,
1 as seqnum
FROM RECORDS.CASE_FILE A
) UNION ALL
(SELECT T1.CASE_FILE_ID, DATE as INITIATED_DATE, NULL as INSPECTION_DATE,
NULL as PAYMENT_DATE, NULL as ACTION_DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.INITIATE
) UNION ALL
(SELECT T1.CASE_FILE_ID, NULL as INITIATED_DATE, DATE as INSPECTION_DATE,
NULL as PAYMENT_DATE, NULL as ACTION_DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.INSPECTION
) UNION ALL
(SELECT T1.CASE_FILE_ID, NULL as INITIATED_DATE, NULL as INSPECTION_DATE,
DATE as PAYMENT_DATE, NULL as ACTION_DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.PAYMENT
) UNION ALL
(SELECT T1.CASE_FILE_ID, NULL as INITIATED_DATE, NULL as INSPECTION_DATE,
NULL as PAYMENT_DATE, ACTION as ACTION_DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.ACTION
)
) a
GROUP BY CASE_FILE_ID, seqnum;
Hmmm, a closely related solution is easier to maintain:
SELECT CASE_FILE_ID,
MAX(CASE WHEN type = 'INITIATED' THEN DATE END) as INITIATED_DATE,
MAX(CASE WHEN type = 'INSPECTION' THEN DATE END) as INSPECTION_DATE,
MAX(CASE WHEN type = 'PAYMENT' THEN DATE END) as PAYMENT_DATE,
MAX(CASE WHEN type = 'ACTION' THEN DATE END) as ACTION
FROM ((SELECT A.CASE_FILE_ID, NULL as TYPE, NULL as DATE,
1 as seqnum
FROM RECORDS.CASE_FILE A
) UNION ALL
(SELECT T1.CASE_FILE_ID, 'INSPECTION', DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.INITIATE
) UNION ALL
(SELECT T1.CASE_FILE_ID, 'INSPECTION', DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.INSPECTION
) UNION ALL
(SELECT T1.CASE_FILE_ID, 'PAYMENT', DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.PAYMENT
) UNION ALL
(SELECT T1.CASE_FILE_ID, 'ACTION', DATE,
ROW_NUMBER() OVER (PARTITION BY CASE_FILE_ID ORDER BY DATE) as seqnum
FROM RECORDS.ACTION
)
) a
GROUP BY CASE_FILE_ID, seqnum;