SQL Server Select the most recent past date if no future date available - sql

I have a table structure as below,
CREATE TABLE #CustOrder ( CustId INT, OrderDate DATE )
INSERT #CustOrder ( CustId, OrderDate )
VALUES ( 1, '2016-11-01' ),
( 1, '2019-09-01' ),
( 2, '2019-07-01' ),
( 2, '2019-11-01' ),
( 3, '2017-01-01' ),
( 4, '2016-12-01' ),
( 4, '2017-01-01' )
I want to list the customer with their future order dates, if they do not have a future order I want to list their last or most recent order. I have the following query.
; WITH LastOrder AS
(
SELECT
CO.CustId,
CO.OrderDate,
ROW_NUMBER() OVER(PARTITION BY CO.CustId ORDER BY ABS(DATEDIFF(DAY, CO.OrderDate, GETUTCDATE()))) AS RowNum
FROM #CustOrder AS CO
)
SELECT LO.CustId, LO.OrderDate
FROM LastOrder AS LO
WHERE LO.RowNum = 1
This query gives me the result as,
CustId | OrderDate
--------+-------------
1 | 2016-11-01
2 | 2019-07-01
3 | 2017-01-01
4 | 2017-01-01
However, I need the result as,
CustId | OrderDate
--------+-------------
1 | 2019-09-01
2 | 2019-07-01
3 | 2017-01-01
4 | 2017-01-01
As
Customer 1 has a future order on 2019-09-01
Customer 2 has two future order but the first one is on 2019-07-01
Customer 3 has no more than 1 order, it should just return 2017-01-01
Customer 4 has two past orders but the most recent is 2017-01-01

rextester: http://rextester.com/PBKNA95127
CREATE TABLE #CustOrder ( CustId INT, OrderDate DATE )
INSERT #CustOrder ( CustId, OrderDate )
VALUES ( 1, '2016-11-01' ),
( 1, '2019-09-01' ),
( 2, '2019-07-01' ),
( 2, '2019-11-01' ),
( 3, '2017-01-01' ),
( 4, '2016-12-01' ),
( 4, '2017-01-01' )
; WITH LastOrder AS
(
SELECT
CO.CustId,
CO.OrderDate,
ROW_NUMBER() OVER(PARTITION BY CO.CustId
ORDER BY case when co.OrderDate > getdate() then 0 else 1 end
, abs(DATEDIFF(DAY, getdate(),CO.OrderDate)) asc
) AS RowNum
FROM #CustOrder AS CO
)
SELECT LO.CustId, LO.OrderDate
FROM LastOrder AS LO
WHERE LO.RowNum = 1
results:
+--------+------------+
| CustId | OrderDate |
+--------+------------+
| 1 | 2019-09-01 |
| 2 | 2019-07-01 |
| 3 | 2017-01-01 |
| 4 | 2017-01-01 |
+--------+------------+

You can use the MAX function to check if the latest date is in the future. If so, get the MIN date after today using MIN. Else get the latest date.
SELECT CUSTID,OrderDate
FROM (SELECT CustId,
OrderDate,
CASE WHEN MAX(orderdate) OVER(PARTITION BY CustId) > GETUTCDATE()
THEN MIN(case when orderdate >getutcdate() then orderdate end) OVER(PARTITION BY CustId)
ELSE MAX(orderdate) OVER(PARTITION BY CustId) end as latest_date
FROM #CustOrder) T
WHERE latest_date=orderDate

Min, Max, UNION approach
select custID, MIN(OrderDate)
from #CustOrder
where OrderDate > '2017-02-17'
group by custID
union all
select co1.custID, max(co1.OrderDate)
from #CustOrder co1
where not exists ( select 1
from #CustOrder co2
where co2.CustId = co1.CustId
and co2.OrderDate > '2017-02-17'
)
group by co1.custID

Start your ORDER BY with a CASE expression that prefers future over past, and then use the ABS DATEDIFF (like you have now) as the second condition in the ORDER BY.

Maybe create another column and use the LAG() window function to grab the last date function and then put a conditional/case statement within the select portion? https://msdn.microsoft.com/en-us/library/hh231256.aspx

Related

Aggregate a subtotal column based on two dates of that same row

Situation:
I have 5 columns
id
subtotal (price of item)
order_date (purchase date)
updated_at (if refunded or any other status change)
status
Objective:
I need the order date as column 1
I need to get the subtotal for each day regardless if of the status as column 2
I need the subtotal amount for refunds for the third column.
Example:
If a purchase is made on May 1st and refunded on May 3rd. The output should look like this
+-------+----------+--------+
| date | subtotal | refund |
+-------+----------+--------+
| 05-01 | 10.00 | 0.00 |
| 05-02 | 00.00 | 0.00 |
| 05-03 | 00.00 | 10.00 |
+-------+----------+--------+
while the row will look like that
+-----+----------+------------+------------+----------+
| id | subtotal | order_date | updated_at | status |
+-----+----------+------------+------------+----------+
| 123 | 10 | 2019-05-01 | 2019-05-03 | refunded |
+-----+----------+------------+------------+----------+
Query:
Currently what I have looks like this:
Note: Timezone discrepancy therefore bring back the dates by 8 hours.
;with cte as (
select id as orderid
, CAST(dateadd(hour,-8,order_date) as date) as order_date
, CAST(dateadd(hour,-8,updated_at) as date) as updated_at
, subtotal
, status
from orders
)
select
b.dates
, sum(a.subtotal_price) as subtotal
, -- not sure how to aggregate it to get the refunds
from Orders as o
inner join cte as a on orders.id=cte.orderid
inner join (select * from cte where status = ('refund')) as b on o.id=cte.orderid
where dates between '2019-05-01' and '2019-05-31'
group by dates
And do I need to join it twice? Hopefully not since my table is huge.
This looks like a job for a Calendar Table. Bit of a stab in the dark, but:
--Overly simplistic Calendar table
CREATE TABLE dbo.Calendar (CalendarDate date);
WITH N AS(
SELECT N
FROM (VALUES(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL))N(N)),
Tally AS(
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 AS I
FROM N N1, N N2, N N3, N N4, N N5) --Many years of data
INSERT INTO dbo.Calendar
SELECT DATEADD(DAY, T.I, 0)
FROM Tally T;
GO
SELECT C.CalendarDate AS [date],
CASE C.CalendarDate WHEN V.order_date THEN subtotal ELSE 0 END AS subtotal,
CASE WHEN C.CalendarDate = V.updated_at AND V.[status] = 'refunded' THEN subtotal ELSE 0.00 END AS subtotal
FROM (VALUES(123,10.00,CONVERT(date,'20190501'),CONVERT(date,'20190503'),'refunded'))V(id,subtotal,order_date,updated_at,status)
JOIN dbo.Calendar C ON V.order_date <= C.CalendarDate AND V.updated_at >= C.CalendarDate;
GO
DROP TABLE dbo.Calendar;
Consider joining on a recursive CTE of sequential dates:
WITH dates AS (
SELECT CONVERT(datetime, '2019-01-01') AS rec_date
UNION ALL
SELECT DATEADD(d, 1, CONVERT(datetime, rec_date))
FROM dates
WHERE rec_date < '2019-12-31'
),
cte AS (
SELECT id AS orderid
, CAST(dateadd(hour,-8,order_date) AS date) as order_date
, CAST(dateadd(hour,-8,updated_at) AS date) as updated_at
, subtotal
, status
FROM orders
)
SELECT rec_date AS date,
CASE
WHEN c.order_date = d.rec_date THEN subtotal
ELSE 0
END AS subtotal,
CASE
WHEN c.updated_at = d.rec_date THEN subtotal
ELSE 0
END AS refund
FROM cte c
JOIN dates d ON d.rec_date BETWEEN c.order_date AND c.updated_at
WHERE c.status = 'refund'
option (maxrecursion 0)
GO
Rextester demo

Is it possible to create counts by date on historic events table?

I have an events table which contains the date of status changes. What I'm trying to achieve is to produce summary counts for each date, however I'm struggling as it is not a straight count by date but instead a count based on the last time the status changed.
The data is as follows:
------------------------------------------
IT_ID NEW_STATUS OLD_STATUS TIMESTAMP
------------------------------------------
100 4 3 06/05/2019
100 3 2 04/05/2019
200 2 1 03/05/2019
100 2 1 02/05/2019
300 2 1 02/05/2019
200 1 - 01/05/2019
100 1 - 01/05/2019
300 1 - 01/05/2019
-------------------------------------------
I've tried grouping, but this hasn't worked due to the above, SQL below for the straight count.
select max(trunc(timestamp)), new_status ,count(new_status)
from status_hist
where trunc(timestamp) >= '01/01/2019'
group by trunc(timestamp), new_status
Ideally I would like the data in the following format, however the key here is to counts against each date. Note, as no status changes took place on the 05/05/19 then it shows the same of the 04/05/19:
---------------------------------------------------------
Date Status 1 Status 2 Status 3 Status 4
---------------------------------------------------------
06/05/2019 0 2 0 1
05/05/2019 0 2 1 0
04/05/2019 0 2 1 0
03/05/2019 0 3 0 0
02/05/2019 1 2 0 0
01/05/2019 3 0 0 0
--------------------------------------------------------
Any help would be gratefully received.
Thanks
I think about handling this problem by getting the status of each person on each date. That requires a cross join to get the person/dates combinations and then some aggregation:
WITH dates as (
SELECT min_dt + LEVEL - 1 AS dt
FROM (SELECT MIN(ts) AS min_dt, MAX(ts) AS max_dt
FROM test_data
)
CONNECT BY min_dt + LEVEL - 1 <= max_dt
)
SELECT d.dt, i.it_id, max(td.new_status) keep (dense_rank first order by td.ts desc) as status
FROM dates d CROSS JOIN
(SELECT DISTINCT IT_ID FROM test_data) i LEFT JOIN
test_data td
ON td.IT_ID = i.IT_ID AND td.ts <= d.dt
GROUP BY d.dt, i.it_id;
The dates CTE is just calculating all dates. The rest is bringing in the latest status.
This can then be expanded to aggregate (or pivot) the results:
WITH dates as (
SELECT min_dt + LEVEL - 1 AS dt
FROM (SELECT MIN(ts) AS min_dt, MAX(ts) AS max_dt
FROM test_data
)
CONNECT BY min_dt + LEVEL - 1 <= max_dt
),
di as (
SELECT d.dt, i.it_id, max(td.new_status) keep (dense_rank first order by td.ts desc) as status
FROM dates d CROSS JOIN
(SELECT DISTINCT IT_ID FROM test_data) i LEFT JOIN
test_data td
ON td.IT_ID = i.IT_ID AND td.ts <= d.dt
GROUP BY d.dt, i.it_id
)
select dt,
sum(case when status = 1 then 1 else 0 end) as num_1,
sum(case when status = 2 then 1 else 0 end) as num_2,
sum(case when status = 3 then 1 else 0 end) as num_3,
sum(case when status = 4 then 1 else 0 end) as num_4
from di
group by dt
order by dt desc;
Here is a db<>fiddle.
You can do it using windowed aggregation functions:
Oracle Setup:
CREATE TABLE test_data ( IT_ID, NEW_STATUS, OLD_STATUS, "TIMESTAMP" ) AS
SELECT 100, 4, 3, DATE '2019-05-06' FROM DUAL UNION ALL
SELECT 100, 3, 2, DATE '2019-05-04' FROM DUAL UNION ALL
SELECT 200, 2, 1, DATE '2019-05-03' FROM DUAL UNION ALL
SELECT 100, 2, 1, DATE '2019-05-02' FROM DUAL UNION ALL
SELECT 300, 2, 1, DATE '2019-05-02' FROM DUAL UNION ALL
SELECT 200, 1, NULL, DATE '2019-05-01' FROM DUAL UNION ALL
SELECT 100, 1, NULL, DATE '2019-05-01' FROM DUAL UNION ALL
SELECT 300, 1, NULL, DATE '2019-05-01' FROM DUAL;
Query:
SELECT DISTINCT
dt AS "TIMESTAMP",
COUNT( CASE new_status WHEN 1 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
- COUNT( CASE old_status WHEN 1 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
AS Status1,
COUNT( CASE new_status WHEN 2 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
- COUNT( CASE old_status WHEN 2 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
AS Status2,
COUNT( CASE new_status WHEN 3 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
- COUNT( CASE old_status WHEN 3 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
AS Status3,
COUNT( CASE new_status WHEN 4 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
- COUNT( CASE old_status WHEN 4 THEN IT_ID END ) OVER ( ORDER BY dt RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )
AS Status4
FROM test_data t
RIGHT OUTER JOIN (
SELECT min_dt + LEVEL - 1 AS dt
FROM ( SELECT MIN("TIMESTAMP") AS min_dt,
MAX("TIMESTAMP") AS max_dt
FROM test_data
)
CONNECT BY min_dt + LEVEL - 1 <= max_dt
) c
ON ( c.dt = t."TIMESTAMP" )
ORDER BY "TIMESTAMP" DESC
Output:
TIMESTAMP | STATUS1 | STATUS2 | STATUS3 | STATUS4
:-------- | ------: | ------: | ------: | ------:
06-MAY-19 | 0 | 2 | 0 | 1
05-MAY-19 | 0 | 2 | 1 | 0
04-MAY-19 | 0 | 2 | 1 | 0
03-MAY-19 | 0 | 3 | 0 | 0
02-MAY-19 | 1 | 2 | 0 | 0
01-MAY-19 | 3 | 0 | 0 | 0
db<>fiddle here
You can use the pivot function of SQL.
I don't have an oracle DB to test this:
declare #dates table(Date timestamp(3), NEW_STATUS number(10))
v_StartDate DATE := (SELECT MIN(timestamp) FROM [test].dbo)
v_EndDate DATE := (SELECT MAX(timestamp) FROM [test].dbo)
insert into #dates
SELECT nbr * INTERVAL '1' DAY(5) - 1 + v_StartDate as 'Date', null as NEW_STATUS
FROM ( SELECT ROW_NUMBER() OVER ( ORDER BY c.object_id ) AS Nbr
FROM sys.columns c
) nbrs
WHERE nbr - 1 <= v_EndDate - v_StartDate
SELECT timestamp as 'Date', 1 AS 'Status 1', 2 AS 'Status 2', 3 AS 'Status 3', 4 AS 'Status 4'
FROM
(SELECT Date as 'timestamp', NVL(NVL(d.new_status, t.NEW_STATUS),t2.NEW_STATUS) as new_status
FROM #dates d
left outer join Table_test t on d.Date = t.TIMESTAMP
left outer join Table_test t2 on INTERVAL '-1' DAY(5) +d.Date = t2.TIMESTAMP and NVL(d.new_status, t.NEW_STATUS) is null ) p
PIVOT
(
COUNT (new_status)
FOR new_status IN
( 1, 2, 3, 4 )
) AS pvt
ORDER BY pvt.TIMESTAMP desc
My Microsoft SQL Syntax is:
declare #dates table([Date] datetime, [NEW_STATUS] int)
DECLARE #StartDate DATE = (SELECT MIN(timestamp) FROM [test].[dbo].[Table_test])
DECLARE #EndDate DATE = (SELECT MAX(timestamp) FROM [test].[dbo].[Table_test])
insert into #dates
SELECT DATEADD(DAY, nbr - 1, #StartDate) as 'Date', null as NEW_STATUS
FROM ( SELECT ROW_NUMBER() OVER ( ORDER BY c.object_id ) AS Nbr
FROM sys.columns c
) nbrs
WHERE nbr - 1 <= DATEDIFF(DAY, #StartDate, #EndDate)
SELECT timestamp as 'Date', [1] AS 'Status 1', [2] AS 'Status 2', [3] AS 'Status 3', [4] AS 'Status 4'
FROM
(SELECT Date as 'timestamp', ISNULL(ISNULL(d.new_status, t.NEW_STATUS),t2.NEW_STATUS) as new_status
FROM #dates d
left outer join Table_test t on d.Date = t.TIMESTAMP
left outer join Table_test t2 on DATEADD(DAY,-1,d.Date) = t2.TIMESTAMP and ISNULL(d.new_status, t.NEW_STATUS) is null ) p
PIVOT
(
COUNT (new_status)
FOR new_status IN
( [1], [2], [3], [4] )
) AS pvt
ORDER BY pvt.TIMESTAMP desc

Query for negative account balance period in bigquery

I am playing around with bigquery and hit an interesting use case. I have a collection of customers and account balances. The account balances collection records any account balance change.
Customers:
+---------+--------+
| ID | Name |
+---------+--------+
| 1 | Alice |
| 2 | Bob |
+---------+--------+
Accounts balances:
+---------+---------------+---------+------------+
| ID | customer_id | value | timestamp |
+---------+---------------+---------+------------+
| 1 | 1 | -500 | 2019-02-12 |
| 2 | 1 | -200 | 2019-02-10 |
| 3 | 2 | 200 | 2019-02-10 |
| 4 | 1 | 0 | 2019-02-09 |
+---------+---------------+---------+------------+
The goal is to find out, for how long a customer has a negative account balance. The resulting collection would look like this:
+---------+--------+---------------------------------+
| ID | Name | Negative account balance since |
+---------+--------+---------------------------------+
| 1 | Alice | 2 days |
+---------+--------+---------------------------------+
Bob is not in the collection, because his last account record shows a positive value.
I think following steps are involved:
get last account balance per customer, see if it is negative
go through the account balance values until you hit a positive (or no more) value
compute datediff
Is something like this even possible in sql? Do you have any ideas on who to create such query? To get customers that currently have a negative account balance, I use this query:
SELECT customer_id FROM (
SELECT t.account_balance, ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY timestamp DESC) as seqnum FROM `account_balances` t
) t
WHERE seqnum = 1 AND account_balance<0
Below is for BigQuery Standard SQL
#standardSQL
SELECT customer_id, name,
SUM(IF(negative_positive < 0, days, 0)) negative_days,
SUM(IF(negative_positive = 0, days, 0)) zero_days,
SUM(IF(negative_positive > 0, days, 0)) positive_days
FROM (
SELECT customer_id, negative_positive, grp,
1 + DATE_DIFF(MAX(ts), MIN(ts), DAY) days
FROM (
SELECT customer_id, ts, SIGN(value) negative_positive,
COUNTIF(flag) OVER(PARTITION BY customer_id ORDER BY ts) grp
FROM (
SELECT *, SIGN(value) = IFNULL(LEAD(SIGN(value)) OVER(PARTITION BY customer_id ORDER BY ts), 0) flag
FROM `project.dataset.balances`
)
)
GROUP BY customer_id, negative_positive, grp
)
LEFT JOIN `project.dataset.customers`
ON id = customer_id
GROUP BY customer_id, name
You can test, play with above using sample data from your question as in below example
#standardSQL
WITH `project.dataset.balances` AS (
SELECT 1 customer_id, -500 value, DATE '2019-02-12' ts UNION ALL
SELECT 1, -200, '2019-02-10' UNION ALL
SELECT 2, 200, '2019-02-10' UNION ALL
SELECT 1, 0, '2019-02-09'
), `project.dataset.customers` AS (
SELECT 1 id, 'Alice' name UNION ALL
SELECT 2, 'Bob'
)
SELECT customer_id, name,
SUM(IF(negative_positive < 0, days, 0)) negative_days,
SUM(IF(negative_positive = 0, days, 0)) zero_days,
SUM(IF(negative_positive > 0, days, 0)) positive_days
FROM (
SELECT customer_id, negative_positive, grp,
1 + DATE_DIFF(MAX(ts), MIN(ts), DAY) days
FROM (
SELECT customer_id, ts, SIGN(value) negative_positive,
COUNTIF(flag) OVER(PARTITION BY customer_id ORDER BY ts) grp
FROM (
SELECT *, SIGN(value) = IFNULL(LEAD(SIGN(value)) OVER(PARTITION BY customer_id ORDER BY ts), 0) flag
FROM `project.dataset.balances`
)
)
GROUP BY customer_id, negative_positive, grp
)
LEFT JOIN `project.dataset.customers`
ON id = customer_id
GROUP BY customer_id, name
-- ORDER BY customer_id
with result
Row customer_id name negative_days zero_days positive_days
1 1 Alice 3 1 0
2 2 Bob 0 0 1

SQL - Find if column dates include at least partially a date range

I need to create a report and I am struggling with the SQL script.
The table I want to query is a company_status_history table which has entries like the following (the ones that I can't figure out)
Table company_status_history
Columns:
| id | company_id | status_id | effective_date |
Data:
| 1 | 10 | 1 | 2016-12-30 00:00:00.000 |
| 2 | 10 | 5 | 2017-02-04 00:00:00.000 |
| 3 | 11 | 5 | 2017-06-05 00:00:00.000 |
| 4 | 11 | 1 | 2018-04-30 00:00:00.000 |
I want to answer to the question "Get all companies that have been at least for some point in status 1 inside the time period 01/01/2017 - 31/12/2017"
Above are the cases that I don't know how to handle since I need to add some logic of type :
"If this row is status 1 and it's date is before the date range check the next row if it has a date inside the date range."
"If this row is status 1 and it's date is after the date range check the row before if it has a date inside the date range."
I think this can be handled as a gaps and islands problem. Consider the following input data: (same as sample data of OP plus two additional rows)
id company_id status_id effective_date
-------------------------------------------
1 10 1 2016-12-15
2 10 1 2016-12-30
3 10 5 2017-02-04
4 10 4 2017-02-08
5 11 5 2017-06-05
6 11 1 2018-04-30
You can use the following query:
SELECT t.id, t.company_id, t.status_id, t.effective_date, x.cnt
FROM company_status_history AS t
OUTER APPLY
(
SELECT COUNT(*) AS cnt
FROM company_status_history AS c
WHERE c.status_id = 1
AND c.company_id = t.company_id
AND c.effective_date < t.effective_date
) AS x
ORDER BY company_id, effective_date
to get:
id company_id status_id effective_date grp
-----------------------------------------------
1 10 1 2016-12-15 0
2 10 1 2016-12-30 1
3 10 5 2017-02-04 2
4 10 4 2017-02-08 2
5 11 5 2017-06-05 0
6 11 1 2018-04-30 0
Now you can identify status = 1 islands using:
;WITH CTE AS
(
SELECT t.id, t.company_id, t.status_id, t.effective_date, x.cnt
FROM company_status_history AS t
OUTER APPLY
(
SELECT COUNT(*) AS cnt
FROM company_status_history AS c
WHERE c.status_id = 1
AND c.company_id = t.company_id
AND c.effective_date < t.effective_date
) AS x
)
SELECT id, company_id, status_id, effective_date,
ROW_NUMBER() OVER (PARTITION BY company_id ORDER BY effective_date) -
cnt AS grp
FROM CTE
Output:
id company_id status_id effective_date grp
-----------------------------------------------
1 10 1 2016-12-15 1
2 10 1 2016-12-30 1
3 10 5 2017-02-04 1
4 10 4 2017-02-08 2
5 11 5 2017-06-05 1
6 11 1 2018-04-30 2
Calculated field grp will help us identify those islands:
;WITH CTE AS
(
SELECT t.id, t.company_id, t.status_id, t.effective_date, x.cnt
FROM company_status_history AS t
OUTER APPLY
(
SELECT COUNT(*) AS cnt
FROM company_status_history AS c
WHERE c.status_id = 1
AND c.company_id = t.company_id
AND c.effective_date < t.effective_date
) AS x
), CTE2 AS
(
SELECT id, company_id, status_id, effective_date,
ROW_NUMBER() OVER (PARTITION BY company_id ORDER BY effective_date) -
cnt AS grp
FROM CTE
)
SELECT company_id,
MIN(effective_date) AS start_date,
CASE
WHEN COUNT(*) > 1 THEN DATEADD(DAY, -1, MAX(effective_date))
ELSE MIN(effective_date)
END AS end_date
FROM CTE2
GROUP BY company_id, grp
HAVING COUNT(CASE WHEN status_id = 1 THEN 1 END) > 0
Output:
company_id start_date end_date
-----------------------------------
10 2016-12-15 2017-02-03
11 2018-04-30 2018-04-30
All you want know is those records from above that overlap with the specified interval.
Demo here with somewhat more complicated use case.
Maybe this is what you are looking for? For these kind of questions, you need to join two instance of your table, in this case I am just joining with next record by Id, which probably is not totally correct. To do it better, you can create a new Id using a windowed function like row_number, ordering the table by your requirement criteria
If this row is status 1 and it's date is before the date range check
the next row if it has a date inside the date range
declare #range_st date = '2017-01-01'
declare #range_en date = '2017-12-31'
select
case
when csh1.status_id=1 and csh1.effective_date<#range_st
then
case
when csh2.effective_date between #range_st and #range_en then true
else false
end
else NULL
end
from company_status_history csh1
left join company_status_history csh2
on csh1.id=csh2.id+1
Implementing second criteria:
"If this row is status 1 and it's date is after the date range check
the row before if it has a date inside the date range."
declare #range_st date = '2017-01-01'
declare #range_en date = '2017-12-31'
select
case
when csh1.status_id=1 and csh1.effective_date<#range_st
then
case
when csh2.effective_date between #range_st and #range_en then true
else false
end
when csh1.status_id=1 and csh1.effective_date>#range_en
then
case
when csh3.effective_date between #range_st and #range_en then true
else false
end
else null -- ¿?
end
from company_status_history csh1
left join company_status_history csh2
on csh1.id=csh2.id+1
left join company_status_history csh3
on csh1.id=csh3.id-1
I would suggest the use of a cte and the window functions ROW_NUMBER. With this you can find the desired records. An example:
DECLARE #t TABLE(
id INT
,company_id INT
,status_id INT
,effective_date DATETIME
)
INSERT INTO #t VALUES
(1, 10, 1, '2016-12-30 00:00:00.000')
,(2, 10, 5, '2017-02-04 00:00:00.000')
,(3, 11, 5, '2017-06-05 00:00:00.000')
,(4, 11, 1, '2018-04-30 00:00:00.000')
DECLARE #StartDate DATETIME = '2017-01-01';
DECLARE #EndDate DATETIME = '2017-12-31';
WITH cte AS(
SELECT *
,ROW_NUMBER() OVER (PARTITION BY company_id ORDER BY effective_date) AS rn
FROM #t
),
cteLeadLag AS(
SELECT c.*, ISNULL(c2.effective_date, c.effective_date) LagEffective, ISNULL(c3.effective_date, c.effective_date)LeadEffective
FROM cte c
LEFT JOIN cte c2 ON c2.company_id = c.company_id AND c2.rn = c.rn-1
LEFT JOIN cte c3 ON c3.company_id = c.company_id AND c3.rn = c.rn+1
)
SELECT 'Included' AS RangeStatus, *
FROM cteLeadLag
WHERE status_id = 1
AND effective_date BETWEEN #StartDate AND #EndDate
UNION ALL
SELECT 'Following' AS RangeStatus, *
FROM cteLeadLag
WHERE status_id = 1
AND effective_date > #EndDate
AND LagEffective BETWEEN #StartDate AND #EndDate
UNION ALL
SELECT 'Trailing' AS RangeStatus, *
FROM cteLeadLag
WHERE status_id = 1
AND effective_date < #EndDate
AND LeadEffective BETWEEN #StartDate AND #EndDate
I first select all records with their leading and lagging Dates and then I perform your checks on the inclusion in the desired timespan.
Try with this, self-explanatory. Responds to this part of your question:
I want to answer to the question "Get all companies that have been at
least for some point in status 1 inside the time period 01/01/2017 -
31/12/2017"
Case that you want to find those id's that have been in any moment in status 1 and have records in the period requested:
SELECT *
FROM company_status_history
WHERE id IN
( SELECT Id
FROM company_status_history
WHERE status_id=1 )
AND effective_date BETWEEN '2017-01-01' AND '2017-12-31'
Case that you want to find id's in status 1 and inside the period:
SELECT *
FROM company_status_history
WHERE status_id=1
AND effective_date BETWEEN '2017-01-01' AND '2017-12-31'

Select distinct users group by time range

I have a table with the following info
|date | user_id | week_beg | month_beg|
SQL to create table with test values:
CREATE TABLE uniques
(
date DATE,
user_id INT,
week_beg DATE,
month_beg DATE
)
INSERT INTO uniques VALUES ('2013-01-01', 1, '2012-12-30', '2013-01-01')
INSERT INTO uniques VALUES ('2013-01-03', 3, '2012-12-30', '2013-01-01')
INSERT INTO uniques VALUES ('2013-01-06', 4, '2013-01-06', '2013-01-01')
INSERT INTO uniques VALUES ('2013-01-07', 4, '2013-01-06', '2013-01-01')
INPUT TABLE:
| date | user_id | week_beg | month_beg |
| 2013-01-01 | 1 | 2012-12-30 | 2013-01-01 |
| 2013-01-03 | 3 | 2012-12-30 | 2013-01-01 |
| 2013-01-06 | 4 | 2013-01-06 | 2013-01-01 |
| 2013-01-07 | 4 | 2013-01-06 | 2013-01-01 |
OUTPUT TABLE:
| date | time_series | cnt |
| 2013-01-01 | D | 1 |
| 2013-01-01 | W | 1 |
| 2013-01-01 | M | 1 |
| 2013-01-03 | D | 1 |
| 2013-01-03 | W | 2 |
| 2013-01-03 | M | 2 |
| 2013-01-06 | D | 1 |
| 2013-01-06 | W | 1 |
| 2013-01-06 | M | 3 |
| 2013-01-07 | D | 1 |
| 2013-01-07 | W | 1 |
| 2013-01-07 | M | 3 |
I want to calculate the number of distinct user_id's for a date:
For that date
For that week up to that date (Week to date)
For the month up to that date (Month to date)
1 is easy to calculate.
For 2 and 3 I am trying to use such queries:
SELECT
date,
'W' AS "time_series",
(COUNT DISTINCT user_id) COUNT (user_id) OVER (PARTITION BY week_beg) AS "cnt"
FROM user_subtitles
SELECT
date,
'M' AS "time_series",
(COUNT DISTINCT user_id) COUNT (user_id) OVER (PARTITION BY month_beg) AS "cnt"
FROM user_subtitles
Postgres does not allow window functions for DISTINCT calculation, so this approach does not work.
I have also tried out a GROUP BY approach, but it does not work as it gives me numbers for whole week/months.
Whats the best way to approach this problem?
Count all rows
SELECT date, '1_D' AS time_series, count(DISTINCT user_id) AS cnt
FROM uniques
GROUP BY 1
UNION ALL
SELECT DISTINCT ON (1)
date, '2_W', count(*) OVER (PARTITION BY week_beg ORDER BY date)
FROM uniques
UNION ALL
SELECT DISTINCT ON (1)
date, '3_M', count(*) OVER (PARTITION BY month_beg ORDER BY date)
FROM uniques
ORDER BY 1, time_series
Your columns week_beg and month_beg are 100 % redundant and can easily be replaced by
date_trunc('week', date + 1) - 1 and date_trunc('month', date) respectively.
Your week seems to start on Sunday (off by one), therefore the + 1 .. - 1.
The default frame of a window function with ORDER BY in the OVER clause uses is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW. That's exactly what you need.
Use UNION ALL, not UNION.
Your unfortunate choice for time_series (D, W, M) does not sort well, I renamed to make the final ORDER BY easier.
This query can deal with multiple rows per day. Counts include all peers for a day.
More about DISTINCT ON:
Select first row in each GROUP BY group?
DISTINCT users per day
To count every user only once per day, use a CTE with DISTINCT ON:
WITH x AS (SELECT DISTINCT ON (1,2) date, user_id FROM uniques)
SELECT date, '1_D' AS time_series, count(user_id) AS cnt
FROM x
GROUP BY 1
UNION ALL
SELECT DISTINCT ON (1)
date, '2_W'
,count(*) OVER (PARTITION BY (date_trunc('week', date + 1)::date - 1)
ORDER BY date)
FROM x
UNION ALL
SELECT DISTINCT ON (1)
date, '3_M'
,count(*) OVER (PARTITION BY date_trunc('month', date) ORDER BY date)
FROM x
ORDER BY 1, 2
DISTINCT users over dynamic period of time
You can always resort to correlated subqueries. Tend to be slow with big tables!
Building on the previous queries:
WITH du AS (SELECT date, user_id FROM uniques GROUP BY 1,2)
,d AS (
SELECT date
,(date_trunc('week', date + 1)::date - 1) AS week_beg
,date_trunc('month', date)::date AS month_beg
FROM uniques
GROUP BY 1
)
SELECT date, '1_D' AS time_series, count(user_id) AS cnt
FROM du
GROUP BY 1
UNION ALL
SELECT date, '2_W', (SELECT count(DISTINCT user_id) FROM du
WHERE du.date BETWEEN d.week_beg AND d.date )
FROM d
GROUP BY date, week_beg
UNION ALL
SELECT date, '3_M', (SELECT count(DISTINCT user_id) FROM du
WHERE du.date BETWEEN d.month_beg AND d.date)
FROM d
GROUP BY date, month_beg
ORDER BY 1,2;
SQL Fiddle for all three solutions.
Faster with dense_rank()
#Clodoaldo came up with a major improvement: use the window function dense_rank(). Here is another idea for an optimized version. It should be even faster to exclude daily duplicates right away. The performance gain grows with the number of rows per day.
Building on a simplified and sanitized data model
- without the redundant columns
- day as column name instead of date
date is a reserved word in standard SQL and a basic type name in PostgreSQL and shouldn't be used as identifier.
CREATE TABLE uniques(
day date -- instead of "date"
,user_id int
);
Improved query:
WITH du AS (
SELECT DISTINCT ON (1, 2)
day, user_id
,date_trunc('week', day + 1)::date - 1 AS week_beg
,date_trunc('month', day)::date AS month_beg
FROM uniques
)
SELECT day, count(user_id) AS d, max(w) AS w, max(m) AS m
FROM (
SELECT user_id, day
,dense_rank() OVER(PARTITION BY week_beg ORDER BY user_id) AS w
,dense_rank() OVER(PARTITION BY month_beg ORDER BY user_id) AS m
FROM du
) s
GROUP BY day
ORDER BY day;
SQL Fiddle demonstrating the performance of 4 faster variants. It depends on your data distribution which is fastest for you.
All of them are about 10x as fast as the correlated subqueries version (which isn't bad for correlated subqueries).
Without correlated subqueries. SQL Fiddle
with u as (
select
"date", user_id,
date_trunc('week', "date" + 1)::date - 1 week_beg,
date_trunc('month', "date")::date month_beg
from uniques
)
select
"date", count(distinct user_id) D,
max(week_dr) W, max(month_dr) M
from (
select
user_id, "date",
dense_rank() over(partition by week_beg order by user_id) week_dr,
dense_rank() over(partition by month_beg order by user_id) month_dr
from u
) s
group by "date"
order by "date"
Try
SELECT
*
FROM
(
SELECT dates, count(user_id), 'D' as timesereis FROM users_data GROUP BY dates
UNION
SELECT max(dates), count(user_id), 'W' FROM users_data GROUP BY date_part('year',dates)+date_part('week',dates)
UNION
SELECT max(dates), count(user_id), 'M' FROM users_data GROUP BY date_part('year',dates)+date_part('week',dates)
) tEMP order by dates, timesereis
SQLFIDDLE
Try queries like this
SELECT count(distinct user_id), date_format(date, '%Y-%m-%d') as date_period
FROM uniques
GROUP By date_period