Copy Most Recent Date's row values where gaps in dates exist - sql

I am creating a report in Tableau for a new product that captures metrics such as previous applications pending, new apps end of day pending etc. In order to do this, I need a snapshot of the end of day status for each application each day. A decision was made above my pay grade to only capture a rolling seven day delta of the data. So, what happens is an application that has not had a status change in the previous seven days stops appearing in the DB until something new happens, which leaves gaps in the dates and throws off the numbers in my report. What I need is a snapshot for each day for each application, so when there is a date gap, I want to grab the most recent previous day's record and insert it to fill in the gaps between the two dates. Also, I join to a credit score table and we sometimes pull all three bureaus, sometimes two, sometimes one, so there could be up to three rows per application per day.
I have looked on this site for similar issues, and I see some similar ones; however, none are an exact match to what I am trying to accomplish, and I honestly do not know where to start. Will a correlated subquery accomplish what I need? I provided some code below to show what the data looks like currently.
drop table if exists #date
drop table if exists #test
create table #date
(
calendar_date date
)
insert into #date
values
('2019-08-07'),
('2019-08-08'),
('2019-08-09'),
('2019-08-10'),
('2019-08-11'),
('2019-08-12')
create table #test
(
id int,
period_date date,
decision_status varchar(20),
credit_score int,
expired_flag bit
)
insert into #test (id,period_date,decision_status,credit_score,expired_flag)
values
(1,'2019-08-08','declined',635,null),
(1,'2019-08-08','declined',642,null),
(1,'2019-08-09','declined',635,null),
(1,'2019-08-09','declined',642,null),
(1,'2019-08-10','declined',635,null),
(1,'2019-08-10','declined',642,null),
(1,'2019-08-11','declined',635,null),
(1,'2019-08-11','declined',642,null),
(1,'2019-08-12','declined',635,null),
(1,'2019-08-12','declined',642,null),
(2,'2019-08-08','review',656,null),
(2,'2019-08-08','review',648,null),
(2,'2019-08-09','review',656,null),
(2,'2019-08-09','review',648,null),
(2,'2019-08-12','review',656,null),
(2,'2019-08-12','review',648,null),
(3,'2019-08-08','preapproved',678,null),
(3,'2019-08-08','preapproved',689,null),
(3,'2019-08-08','preapproved',693,null),
(3,'2019-08-09','preapproved',678,null),
(3,'2019-08-09','preapproved',689,null),
(3,'2019-08-09','preapproved',693,null),
(3,'2019-08-11','preapproved',678,1),
(3,'2019-08-11','preapproved',689,1),
(3,'2019-08-11','preapproved',693,1),
(3,'2019-08-12','preapproved',678,1),
(3,'2019-08-12','preapproved',689,1),
(3,'2019-08-12','preapproved',693,1),
(4,'2019-08-08','onboarded',725,null),
(4,'2019-08-09','onboarded',725,null),
(4,'2019-08-10','onboarded',725,null),
(5,'2019-08-08','approved',685,null),
(5,'2019-08-08','approved',675,null),
(5,'2019-08-09','approved',685,null),
(5,'2019-08-09','approved',675,null),
(5,'2019-08-12','approved',685,1),
(5,'2019-08-12','approved',675,1)
And the query:
-- Pair every #test row with the calendar day that precedes its period_date.
select
    t.id,
    d.calendar_date,
    t.period_date,
    t.decision_status,
    t.credit_score,
    t.expired_flag
from #date as d
inner join #test as t
    on d.calendar_date = dateadd(day, -1, t.period_date)
order by
    t.id,
    d.calendar_date
I just need each application to show for each day.

You may just need a left join:
-- Keep every calendar day; the #test columns are NULL on days with no matching row.
select
    t.id,
    d.calendar_date,
    t.period_date,
    t.decision_status,
    t.credit_score,
    t.expired_flag
from #date as d
left join #test as t
    on d.calendar_date = dateadd(day, -1, t.period_date)
order by
    t.id,
    d.calendar_date;
If by "application" you mean the id in #test, then use a cross join to generate the rows and an outer apply to fill in the values:
-- One result set covering every calendar day for every application id.
-- The cross join builds the (day, id) grid; the outer apply then fetches the
-- rows from the most recent period_date on or before that day.
--   * i.id (not t.id) is selected so the id is still shown when no earlier
--     row exists and the applied columns are NULL.
--   * WITH TIES keeps all rows that share the latest period_date, because an
--     application can have up to three credit-score rows per day.
select i.id, d.calendar_date, t.period_date, t.decision_status, t.credit_score, t.expired_flag
from #date d cross join
(select distinct id from #test) i outer apply
(select top (1) with ties t.*
from #test t
where t.id = i.id and t.period_date <= d.calendar_date
order by t.period_date desc
) t
order by i.id, d.calendar_date, t.credit_score;

Update:
After receiving the reply from Gordon, which gave me some inspiration and set me in the right direction, and conducting some additional research, I appear to have found a solution that is working. I wanted to share the solution here in case anyone else runs across this problem. I am posting the code below:
drop table if exists #date
drop table if exists #test
drop table if exists #test1
drop table if exists #row_num
create table #date
(
calendar_date date
)
insert into #date
values
('2019-08-07'),
('2019-08-08'),
('2019-08-09'),
('2019-08-10'),
('2019-08-11')
create table #test
(
id int,
period_date date,
decision_status varchar(20),
credit_score int,
expired_flag bit
)
insert into #test (id,period_date,decision_status,credit_score,expired_flag)
values
(1,'2019-08-08','declined',635,null),
(1,'2019-08-08','declined',642,null),
(1,'2019-08-09','declined',635,null),
(1,'2019-08-09','declined',642,null),
(1,'2019-08-10','declined',635,null),
(1,'2019-08-10','declined',642,null),
(1,'2019-08-11','declined',635,null),
(1,'2019-08-11','declined',642,null),
(1,'2019-08-12','declined',635,null),
(1,'2019-08-12','declined',642,null),
(2,'2019-08-08','review',656,null),
(2,'2019-08-08','review',648,null),
(2,'2019-08-09','review',656,null),
(2,'2019-08-09','review',648,null),
(2,'2019-08-12','review',656,null),
(2,'2019-08-12','review',648,null),
(3,'2019-08-08','preapproved',678,null),
(3,'2019-08-08','preapproved',689,null),
(3,'2019-08-08','preapproved',693,null),
(3,'2019-08-09','preapproved',678,null),
(3,'2019-08-09','preapproved',689,null),
(3,'2019-08-09','preapproved',693,null),
(3,'2019-08-11','preapproved',678,1),
(3,'2019-08-11','preapproved',689,1),
(3,'2019-08-11','preapproved',693,1),
(3,'2019-08-12','preapproved',678,1),
(3,'2019-08-12','preapproved',689,1),
(3,'2019-08-12','preapproved',693,1),
(4,'2019-08-08','onboarded',725,null),
(4,'2019-08-09','onboarded',725,null),
(4,'2019-08-10','onboarded',725,null),
(5,'2019-08-08','approved',685,null),
(5,'2019-08-08','approved',675,null),
(5,'2019-08-09','approved',685,null),
(5,'2019-08-09','approved',675,null),
(5,'2019-08-12','approved',685,1),
(5,'2019-08-12','approved',675,1)
-- Build #test1: each #test row shifted onto the calendar day before its
-- period_date, numbered 1..n within (id, calendar_date).
-- The numbering is ordered by credit_score rather than by calendar_date:
-- calendar_date is constant inside its own partition, so ordering by it leaves
-- row_id arbitrary, and the later joins on row_id need the same score row to
-- get the same number every day.
-- NOTE(review): this assumes an application's scores keep the same relative
-- order from day to day; if a bureau identifier is available, order by that.
-- The trailing ORDER BY was dropped because it does not control how
-- SELECT ... INTO stores the rows.
select id,calendar_date,decision_status,credit_score,expired_flag
,ROW_NUMBER() over(partition by id,calendar_date order by credit_score) as row_id
,cast(ROW_NUMBER() over(partition by id,calendar_date order by credit_score) as char(1)) as row_num
into #test1
from #date
join #test
on calendar_date=dateadd(day,-1,period_date)
create table #row_num
(
row_id int,
row_num char(1)
)
insert into #row_num
values
(1,'1'),
(2,'2'),
(3,'3')
select i.id
,d.calendar_date
,coalesce(t.decision_status,t1.decision_status) as decision_status
,coalesce(t.credit_score,t1.credit_score) as credit_score
,coalesce(t.expired_flag,t1.expired_flag) as expired_flag
from #date d
cross join
(select distinct id
from #test1 ) i
cross join #row_num r
left join #test1 t
on t.id=i.id
and t.row_id=r.row_id
and t.calendar_date=d.calendar_date
join
(select id,row_id,decision_status,credit_score,expired_flag
,calendar_date as start_date
,lead(calendar_date,1,dateadd(day,1,(select max(calendar_date) from #date)))
over (partition by id,row_id order by calendar_date) as end_date
from #test1
) t1
on t1.id=i.id
and t1.row_id=r.row_id
and d.calendar_date>=t1.start_date
and d.calendar_date<t1.end_date
order by i.id,d.calendar_date,r.row_id
This gives me what I am looking for, all the daily records for each application for each day.

Related

how to show records both the tables side by side

I have App table and Apphistory table i need to show app records and app history record side by side columns
-- Sample data for the current application rows (#app) and their history
-- (#apphistory). These are created as temporary tables: "declare #name table"
-- is not valid T-SQL, whereas "create table #name" is, and it keeps the #app /
-- #apphistory names used by the queries that follow.
create table #app (Appno varchar(10),Name varchar(10),Height INT,weight INT,Createddate datetime)
insert into #app (Appno,Name,Height,weight,Createddate)values
('app1035','tom',10,60,'2015-07-02 20:14:45.590'),
('app1036','john',8,40,'2015-07-02 20:14:45.590'),
('app1037','jim',9,36,'2015-07-02 20:14:45.590')
create table #apphistory
(
Appno varchar(10),
Name varchar(10),
Height INT,
weight INT,
Createddate datetime)
-- app1037 deliberately has no history rows
insert into #apphistory (Appno,Name,Height,weight,Createddate)
values('app1035','tom',10,60,'2015-07-02 20:14:45.590')
,('app1035','tom',8,45,'2015-06-02 20:14:45.590'),
('app1035','tom',6,NULL,'2015-05-02 20:14:45.590'),
('app1036','john',8,40,'2015-07-02 20:14:45.590')
,('app1036','john',8,40,'2015-06-02 20:14:45.590'),
('app1036','john',NULL,NULL,'2015-05-02 20:14:45.590')
select A.Appno, COALESCE(H.Appno,A.Appno)HAppno,
A.Name,COALESCE(H.Name,A.Name)Hname,
A.Height,COALESCE(H.Height,A.Height)Hheight,
A.weight,COALESCE(H.weight,A.weight)Hweight,
A.Createddate,COALESCE(H.Createddate,A.Createddate)Hcreateddate
FROM #app A LEFT JOIN (select top 1 Appno,Name,Height,weight,Createddate from #apphistory ORDER BY Createddate )H
ON A.Appno = H.Appno
WHERE A.Appno = 'app1036'
My problem is this: when an Appno exists in the App table but not in the Apphistory table, I want to show the record from the App table.
When there is a record in both tables, I need to show the oldest record for the same Appno, with the values taken from the Apphistory table.
out put should be like :
Appno HAppno Name Hname Height Hheight weight Hweight Createddate Hcreateddate
app1036 app1036 john john 8 NULL 40 NULL 2015-07-02 20:14:45.590 2015-07-02 20:14:45.590
Use ROW_NUMBER() function to extract oldest record by created date. Fiddle sample
;WITH CTE AS
(
SELECT *, ROW_NUMBER() OVER (PARTITION BY Appno ORDER BY CreatedDate) rn
FROM #apphistory
)
SELECT a.*, h.*
FROM #app a
LEFT JOIN CTE h ON a.Appno = h.Appno AND h.rn = 1
You can use Coalesce() function to fill null values and order the select column list for your preference.
Here is a simplified code-sample of what I am suggesting:
-- Layered COALESCE: the inner COALESCE turns a NULL history value into the
-- marker text 'NULL', so the outer COALESCE falls back to the App value only
-- when there is no history row at all.
-- Height is an INT, so it is cast to text on both levels; mixing the INT
-- column with the string 'NULL' directly would raise a conversion error.
SELECT a.AppNo, COALESCE(ah.Height, CAST(a.Height AS VARCHAR(11))) AS Height
FROM App a
LEFT OUTER JOIN (
SELECT AppNo, COALESCE(CAST(Height AS VARCHAR(11)), 'NULL') AS Height
FROM AppHistory
) ah
ON a.AppNo=ah.AppNo
I realize this leaves out a lot of the logic in your original question, I am only trying to illustrate the layered Coalesce technique I mentioned in my comment.

Drop rows identified within moving time window

I have a dataset of hospitalisations ('spells') - 1 row per spell. I want to drop any spells recorded within a week after another (there could be multiple) - the rationale being is that they're likely symptomatic of the same underlying cause. Here is some play data:
create table hif_user.rzb_recurse_src (
patid integer not null,
eventdate integer not null,
type smallint not null
);
insert into hif_user.rzb_recurse_src values (1,1,1);
insert into hif_user.rzb_recurse_src values (1,3,2);
insert into hif_user.rzb_recurse_src values (1,5,2);
insert into hif_user.rzb_recurse_src values (1,9,2);
insert into hif_user.rzb_recurse_src values (1,14,2);
insert into hif_user.rzb_recurse_src values (2,1,1);
insert into hif_user.rzb_recurse_src values (2,5,1);
insert into hif_user.rzb_recurse_src values (2,19,2);
Only spells of type 2 - within a week after any other - are to be dropped. Type 1 spells are to remain.
For patient 1, dates 1 & 9 should be kept. For patient 2, all rows should remain.
The issue is with patient 1. Spell date 9 is identified for dropping as it is close to spell date 5; however, as spell date 5 is close to spell date 1, it should be dropped, therefore allowing spell date 9 to live...
So, it seems a recursive problem. However, I've not used recursive programming in SQL before and I'm struggling to really picture how to do it. Can anyone help? I should add that I'm using Teradata which has more restrictions than most with recursive SQL (only UNION ALL sets allowed I believe).
It's a cursor logic, check one row after the other if it fits your rules, so recursion is the easiest (maybe the only) way to solve your problem.
To get a decent performance you need a Volatile Table to facilitate this row-by-row processing:
-- Stage the source rows with a per-patient sequence number (rn) so the
-- recursive step can walk from one row to the next with rn = rn + 1.
-- The column list names exactly the four columns the SELECT returns
-- (three source columns plus rn); startdate is produced later by the
-- recursive query, not stored here.
CREATE VOLATILE TABLE vt (patid, eventdate, exac_type, rn) AS
(
SELECT r.*
,ROW_NUMBER() -- needed to facilitate the join
OVER (PARTITION BY patid ORDER BY eventdate) AS rn
FROM hif_user.rzb_recurse_src AS r
) WITH DATA ON COMMIT PRESERVE ROWS;
WITH RECURSIVE cte (patid, eventdate, exac_type, rn, startdate) AS
(
SELECT vt.*
,eventdate AS startdate
FROM vt
WHERE rn = 1 -- start with the first row
UNION ALL
SELECT vt.*
-- check if type = 1 or more than 7 days from the last eventdate
,CASE WHEN vt.eventdate > cte.startdate + 7
OR vt.exac_type = 1
THEN vt.eventdate -- new start date
ELSE cte.startdate -- keep old date
END
FROM vt JOIN cte
ON vt.patid = cte.patid
AND vt.rn = cte.rn + 1 -- proceed to next row
)
SELECT *
FROM cte
WHERE eventdate - startdate = 0 -- only new start days
order by patid, eventdate
I think the key to solving this is getting the first date more than 7 days from the current date and then doing a recursive subquery:
-- rrs: every spell plus eventdate7, the first later spell of the same patient
-- that lies more than 7 days after it (NULL when there is none).
-- cte: starts at each patient's first spell and repeatedly jumps to the
-- eventdate7 of the spell it is currently on.
-- Both parts of cte read from rrs, since eventdate7 exists only there and not
-- in the base table.
-- NOTE(review): where the RECURSIVE keyword goes, and whether a column list is
-- required on the recursive CTE, differs by engine - confirm for the target.
with rrs as (
select src.*,
(select min(rrs2.eventdate)
from hif_user.rzb_recurse_src rrs2
where rrs2.patid = src.patid and
rrs2.eventdate > src.eventdate + 7
) as eventdate7
from hif_user.rzb_recurse_src src
),
recursive cte as (
select patid, min(eventdate) as eventdate, min(eventdate7) as eventdate7
from rrs
group by patid
union all
select cte.patid, cte.eventdate7, rrs.eventdate7
from cte join
rrs
on rrs.patid = cte.patid and
rrs.eventdate = cte.eventdate7
)
select cte.patid, cte.eventdate
from cte;
If you want additional columns, then join in the original table at the last step.

T-SQL - Pivot by week

I'm currently trying to create a T-SQL, which runs through a list of deliveries in a table, and groups them by the Customer and the Depot - so each row will be
Customer, Depot, Total Value (sum of a column called Rate)
However, the customer would like the 'total value' split into the last 9 weeks - so rather than total value, we'll have columns like this:
22/01/2012 29/01/2012 05/02/2012 12/02/2012 19/02/2012 26/02/2012 04/03/2012 11/03/2012 18/03/2012
The dates would of course change for when they run the query - it'll just be the last 9 weeks. They also want a column for the Average of all these.
I understand pivot may help me but I'm a bit stumped on how to do this. Here's my current query:
-- Total delivery value per depot and customer.
-- Column aliases are written as identifiers; the original 'Depot alias was
-- missing its closing quote, which broke the whole statement.
SELECT d.Name AS Depot, s.Name AS Customer, SUM(c.Rates) AS [Total Value]
FROM Deliveries AS c INNER JOIN Account AS s ON c.Customer = s.ID
INNER JOIN Depots AS d ON c.CollectionDepot = d.Letter
GROUP BY d.Name, s.Name
Many thanks!
EDIT: Here's a screenshot of the data currently - we won't need the 'total' column on the end, just there to show you. The 'Date' column is present in the Deliveries table and is called TripDate
Without knowing your exact data, it is hard to predict what you are getting. But I can suggest a solution.
Table structure
CREATE TABLE Deliveries
(
Customer INT,
CollectionDepot INT,
Rates FLOAT,
TripDate DATETIME
)
CREATE TABLE Account
(
Name VARCHAR(100),
ID INT
)
CREATE TABLE Depots
(
Name VARCHAR(100),
Letter INT
)
Test data
INSERT INTO Deliveries
VALUES
(1,1,452,GETDATE()-10),
(1,1,800,GETDATE()-30),
(1,1,7895,GETDATE()-2),
(1,1,451,GETDATE()-2),
(1,1,478,GETDATE()-89),
(1,1,4512,GETDATE()-31),
(1,1,782,GETDATE()-20),
(1,1,652,GETDATE()-5),
(1,1,752,GETDATE()-452)
INSERT INTO Account
VALUES
('Customer 1',1)
INSERT INTO Depots
VALUES
('Depot 1',1)
Table that contains the ranges and the formated date
CREATE TABLE #tmp
(
StartDate DATETIME,
EndDate DATETIME,
FomatedDate VARCHAR(20)
)
Calculate the date ranges
;WITH Nbrs ( n ) AS (
SELECT 0 UNION ALL
SELECT 1+n FROM Nbrs WHERE n < 8 )
INSERT INTO #tmp
SELECT
DATEADD(WEEK,-n-1,GETDATE()),
DATEADD(WEEK,-n,GETDATE()),
convert(varchar, DATEADD(WEEK,-n,GETDATE()), 112)
FROM
Nbrs
ORDER BY
-n
The date columns for the pivot
-- Build the bracketed, comma-separated column list for the PIVOT from the
-- week labels stored in #tmp. Local variables must start with @.
DECLARE @cols VARCHAR(MAX)
SELECT @cols = COALESCE(@cols + ','+QUOTENAME(FomatedDate),
QUOTENAME(FomatedDate))
FROM
#tmp
-- Declaring some dynamic sql and executing it
-- NVARCHAR(MAX) so that a long column list cannot truncate the statement.
-- Average and Total are computed per depot/customer rather than over all rows.
-- Each delivery falls into exactly one week: the range is open at StartDate
-- and closed at EndDate, so a boundary timestamp is not counted twice.
-- NOTE(review): the pivot aggregates with AVG; if the weekly columns should
-- hold totals, SUM is the aggregate to use - confirm against the requirement.
DECLARE @query NVARCHAR(MAX)=
N'SELECT
*
FROM
(
SELECT
Depots.Name AS Depot,
Account.Name AS Customer,
Deliveries.Rates,
tmp.FomatedDate,
AVG(Deliveries.Rates) OVER(PARTITION BY Depots.Name, Account.Name) AS Average,
SUM(Deliveries.Rates) OVER(PARTITION BY Depots.Name, Account.Name) AS Total
FROM
Deliveries
JOIN Account
ON Deliveries.Customer = Account.ID
JOIN Depots
ON Deliveries.CollectionDepot = Depots.Letter
JOIN #tmp AS tmp
ON Deliveries.TripDate > tmp.StartDate AND Deliveries.TripDate <= tmp.EndDate
) AS p
PIVOT
(
AVG(rates)
FOR FomatedDate IN ('+@cols+')
) AS pvt'
EXECUTE(@query)
And then cleaning up after myself.
DROP TABLE Deliveries
DROP TABLE Account
DROP TABLE Depots
DROP TABLE #tmp
You would have to make use of the PIVOT Keyword which is available in your version of SQL Server. I have outlined how your query should look, of course some tweaking will be required since it is difficult to test without having a copy of your data.
-- Static PIVOT outline: one row per depot/customer, one column per week.
-- The derived table supplies the grouping columns, the value to aggregate and
-- a text label for the week; PIVOT then sums Rates under each label.
-- Pivot column names are bracketed identifiers, not string literals.
-- WeekEnding is the Sunday on or after TripDate, formatted dd/mm/yyyy
-- (style 103). 1900-01-07 is a Sunday, so the arithmetic does not depend on
-- SET DATEFIRST.
-- NOTE(review): assumes each column date is the Sunday that ends its week -
-- confirm how deliveries should be bucketed.
SELECT Depot, Customer, [22/01/2012], [29/01/2012], [05/02/2012], [12/02/2012]
FROM
(SELECT Depots.Name AS Depot, Account.Name AS Customer, Deliveries.Rates,
CONVERT(VARCHAR(10), DATEADD(DAY, (DATEDIFF(DAY, '19000107', Deliveries.TripDate) + 6) / 7 * 7, '19000107'), 103) AS WeekEnding
FROM Deliveries
INNER JOIN Account ON Deliveries.Customer = Account.ID
INNER JOIN Depots ON Deliveries.CollectionDepot = Depots.Letter) AS Source
PIVOT
(
SUM(Rates)
FOR WeekEnding IN ([22/01/2012], [29/01/2012], [05/02/2012], [12/02/2012])
) AS PivotTable
For reference you could use this as a guide:
http://msdn.microsoft.com/en-us/library/ms177410.aspx

SQL Server: row present in one query, missing in another

Ok so I think I must be misunderstanding something about SQL queries. This is a pretty wordy question, so thanks for taking the time to read it (my problem is right at the end, everything else is just context).
I am writing an accounting system that works on the double-entry principle -- money always moves between accounts, a transaction is 2 or more TransactionParts rows decrementing one account and incrementing another.
Some TransactionParts rows may be flagged as tax related so that the system can produce a report of total VAT sales/purchases etc, so it is possible that a single Transaction may have two TransactionParts referencing the same Account -- one VAT related, and the other not. To simplify presentation to the user, I have a view to combine multiple rows for the same account and transaction:
-- One row per (transaction, account): the amounts of all TransactionParts
-- rows that share both values are added together.
create view Accounting.CondensedEntryView
as
select
    parts.[Transaction],
    parts.Account,
    sum(parts.Amount) as Amount
from Accounting.TransactionParts as parts
group by
    parts.[Transaction],
    parts.Account
I then have a view to calculate the running balance column, as follows:
create view Accounting.TransactionBalanceView as
with cte as
(
select ROW_NUMBER() over (order by t.[Date]) AS RowNumber,
t.ID as [Transaction], p.Amount, p.Account
from Accounting.Transactions t
inner join Accounting.CondensedEntryView p on p.[Transaction]=t.ID
)
select b.RowNumber, b.[Transaction], a.Account,
coalesce(sum(a.Amount), 0) as Balance
from cte a, cte b
where a.RowNumber <= b.RowNumber AND a.Account=b.Account
group by b.RowNumber, b.[Transaction], a.Account
For reasons I haven't yet worked out, a certain transaction (ID=30) doesn't appear on an account statement for the user. I confirmed this by running
select * from Accounting.TransactionBalanceView where [Transaction]=30
This gave me the following result:
RowNumber Transaction Account Balance
-------------------- ----------- ------- ---------------------
72 30 23 143.80
As I said before, there should be at least two TransactionParts for each Transaction, so one of them isn't being presented in my view. I assumed there must be an issue with the way I've written my view, and run a query to see if there's anything else missing:
select [Transaction], count(*)
from Accounting.TransactionBalanceView
group by [Transaction]
having count(*) < 2
This query returns no results -- not even for Transaction 30! Thinking I must be an idiot I run the following query:
select [Transaction]
from Accounting.TransactionBalanceView
where [Transaction]=30
It returns two rows! So select * returns only one row and select [Transaction] returns both. After much head-scratching and re-running the last two queries, I concluded I don't have the faintest idea what's happening. Any ideas?
Thanks a lot if you've stuck with me this far!
Edit:
Here are the execution plans:
select *
select [Transaction]
1000 lines each, hence finding somewhere else to host.
Edit 2:
For completeness, here are the tables I used:
create table Accounting.Accounts
(
ID smallint identity primary key,
[Name] varchar(50) not null
constraint UQ_AccountName unique,
[Type] tinyint not null
constraint FK_AccountType foreign key references Accounting.AccountTypes
);
create table Accounting.Transactions
(
ID int identity primary key,
[Date] date not null default getdate(),
[Description] varchar(50) not null,
Reference varchar(20) not null default '',
Memo varchar(1000) not null
);
create table Accounting.TransactionParts
(
ID int identity primary key,
[Transaction] int not null
constraint FK_TransactionPart foreign key references Accounting.Transactions,
Account smallint not null
constraint FK_TransactionAccount foreign key references Accounting.Accounts,
Amount money not null,
VatRelated bit not null default 0
);
Demonstration of possible explanation.
Create table Script
SELECT *
INTO #T
FROM master.dbo.spt_values
CREATE NONCLUSTERED INDEX [IX_T] ON #T ([name] DESC,[number] DESC);
Query one (Returns 35 results)
WITH cte AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY NAME) AS rn
FROM #T
)
SELECT c1.number,c1.[type]
FROM cte c1
JOIN cte c2 ON c1.rn=c2.rn AND c1.number <> c2.number
Query Two (Same as before but adding c2.[type] to the select list makes it return 0 results)
;
WITH cte AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY NAME) AS rn
FROM #T
)
SELECT c1.number,c1.[type] ,c2.[type]
FROM cte c1
JOIN cte c2 ON c1.rn=c2.rn AND c1.number <> c2.number
Why?
row_number() for duplicate NAMEs isn't specified so it just chooses whichever one fits in with the best execution plan for the required output columns. In the second query this is the same for both cte invocations, in the first one it chooses a different access path with resultant different row_numbering.
Suggested Solution
You are self joining the CTE on ROW_NUMBER() over (order by t.[Date])
Contrary to what may have been expected the CTE will likely not be materialised which would have ensured consistency for the self join and thus you assume a correlation between ROW_NUMBER() on both sides that may well not exist for records where a duplicate [Date] exists in the data.
What if you try ROW_NUMBER() over (order by t.[Date], t.[id]) to ensure that in the event of tied dates the row_numbering is in a guaranteed consistent order. (Or some other column/combination of columns that can differentiate records if id won't do it)
If the purpose of this part of the view is just to make sure that the same row isn't joined to itself
where a.RowNumber <= b.RowNumber
then how does changing this part to
where a.RowNumber <> b.RowNumber
affect the results?
It seems you read dirty entries. (Someone else deletes/insertes new data)
try SET TRANSACTION ISOLATION LEVEL READ COMMITTED.
i've tried this code (seems equal to yours)
IF object_id('tempdb..#t') IS NOT NULL DROP TABLE #t
CREATE TABLE #t(i INT, val INT, acc int)
INSERT #t
SELECT 1, 2, 70
UNION ALL SELECT 2, 3, 70
;with cte as
(
select ROW_NUMBER() over (order by t.i) AS RowNumber,
t.val as [Transaction], t.acc Account
from #t t
)
select b.RowNumber, b.[Transaction], a.Account
from cte a, cte b
where a.RowNumber <= b.RowNumber AND a.Account=b.Account
group by b.RowNumber, b.[Transaction], a.Account
and got two rows
RowNumber Transaction Account
1 2 70
2 3 70

Selecting date intervals, doing it fast, and always returning the latest entry with the result

I have a database with a table, storing changes in account-balance across a couple of accounts, with three columns;
float balance, #The account balance after the change
Date date, #Date that balance change occurred
int aid #Account that the balance change occurred on
It contains a couple of entries for each day of the year, and I want to retrieve the balance of every five days. I also want it to separate between accounts (ie if two changes
occurred on the same day, but on separate accounts, return both).
The problem is this: Sometimes there will be several days (or weeks) where there is no data available. When that occurs, I want to make sure to return the latest entry before the "hole" in the dataset. This is a simplified version of the problem, the actual database is big (several gigabytes), the size is the reason why I want to return a subset of the data. It cannot use platform specific methods, because it needs to work on both oracle and mySQL.
My question is: Is there any way to do this fast? I would be able to write a query that gets the job done, but I am hoping there is some devil magic way of doing it that does not require lots of nested queries and aggregate functions..
I would use Andomar's Period table idea, but I would try a slightly different final query. This assumes that your Account_Balances table has a PK on aid and date. If you ended up with two balances for the same account for the same exact date and time then you would get some duplicate rows.
-- For each period, return every account's latest balance dated on or before
-- the period end.
-- AB1 is a candidate balance; AB2 looks for a later balance of the same
-- account that is still within the period end. Keeping only rows where no
-- AB2 exists leaves the latest balance per account and period.
-- The account column is aid (as used in the join conditions); it is exposed
-- under the name account_id.
SELECT
P.start_date,
P.end_date,
AB1.aid AS account_id,
AB1.balance
FROM
Periods P
LEFT OUTER JOIN Account_Balances AB1 ON
AB1.date <= P.end_date
LEFT OUTER JOIN Account_Balances AB2 ON
AB2.aid = AB1.aid AND
AB2.date > AB1.date AND
AB2.date <= P.end_date
WHERE
AB2.aid IS NULL
If the account has no rows before or during the given period you will not get a row back for it.
You can do this in a relatively straightforward way by creating a period table, which you can join with the accounts table to create one row per account per period.
Here's an example. Let's set up some temporary tables:
create table #balance (
id int identity,
balance float,
date datetime,
aid int
)
create table #period (
id int identity,
startdt datetime,
enddt datetime
)
Enter some test data:
-- Test balances for two accounts, including two changes on the same day
-- (2009-01-10) for each. The rows go into #balance, the table created above;
-- no table named #yourtable exists.
insert into #balance (balance, date, aid) values (4,'2009-01-01',1)
insert into #balance (balance, date, aid) values (5,'2009-01-10',1)
insert into #balance (balance, date, aid) values (6,'2009-01-10',1)
insert into #balance (balance, date, aid) values (7,'2009-01-16',1)
insert into #balance (balance, date, aid) values (2,'2009-01-01',2)
insert into #balance (balance, date, aid) values (3,'2009-01-10',2)
insert into #balance (balance, date, aid) values (4,'2009-01-10',2)
insert into #balance (balance, date, aid) values (5,'2009-01-16',2)
-- Four consecutive five-day periods
insert into #period (startdt, enddt) values ('2009-01-01','2009-01-06')
insert into #period (startdt, enddt) values ('2009-01-06','2009-01-11')
insert into #period (startdt, enddt) values ('2009-01-11','2009-01-16')
insert into #period (startdt, enddt) values ('2009-01-16','2009-01-21')
Now let's query all periods:
from #period p
Add one row for each balance before the end of the period:
left join #balance b1 on
b1.date <= p.enddt
Search for balances in between the balance from the first join, and the end of the period:
left join #balance b2 on
b2.aid = b1.aid
and b1.id < b2.id
and b2.date <= p.enddt
Then filter out the rows that are not the last balance for their period.
where
b2.aid is null
The b2 join basically looks for the "in-between" value, and by saying it's id is null, you say no in-between row exists. The final query looks like this:
-- Latest balance per account for each period: b1 is a balance dated on or
-- before the period end, b2 is any balance of the same account with a higher
-- id that is also within the period end; rows are kept only when no such b2
-- exists.
select
    b1.aid,
    p.startdt,
    b1.balance
from #period as p
left join #balance as b1
    on b1.date <= p.enddt
left join #balance as b2
    on b2.aid = b1.aid
    and b2.id > b1.id
    and b2.date <= p.enddt
where b2.aid is null
order by
    b1.aid,
    p.startdt
Note: the queries assume a balance with a later date always has a larger id. If you never have two balances with exactly the same date, you can replace "b1.id < b2.id" with "b1.date < b2.date".
If you wait for postgresql 8.4 you might be able to make use of Window Functions
http://www.postgresql.org/docs/8.4/static/tutorial-window.html
http://www.postgresql.org/docs/8.4/static/functions-window.html