BigQuery - incremental re-partition from ingest to business date in consumption layer

I'm a newbie to BigQuery and trying to figure out a solution to this scenario:
Transaction data is ingested into a BQ table that is partitioned on ingest_date.
The business key for a transaction is trade_id, but a transaction can also have versions, so the business key for a record is trade_id + trade_version.
There may be duplicates on the business key, so the technical key is trade_id + trade_version + ingest_timestamp.
A transaction carries a business timestamp for when it was actually executed, but this may differ from the ingestion timestamp since data can arrive late by several days or even weeks.
The data is to be enriched and transformed in several steps and finally end up in a table available for consumption downstream.
The consumption table must be partitioned on business date for good filtering performance, and must also contain a flag to enable filtering on the latest version only.
I've googled quite a lot on this but haven't seen any clear solution or pattern to use, so I did some elaboration on my own and came up with a solution that it would be great to get some reviews and comments on (I'm not 100% sure it works).
I've also read briefly about dbt and Dataform; do these tools have automated solutions for this?
Here it is:
The small data model I've worked with: https://i.stack.imgur.com/P1baT.jpg
---------------------------------------------------------------------------------------------
-- Table DDLs
CREATE OR REPLACE TABLE `<dataset>.raw`
(
  raw_key STRING,
  ingest_date DATE,
  ingest_timestamp TIMESTAMP,
  trade_id STRING,
  trade_version STRING,
  business_date DATE,
  business_timestamp TIMESTAMP
)
PARTITION BY ingest_date
OPTIONS (
  require_partition_filter = true
);

CREATE OR REPLACE TABLE `<dataset>.partition_ingest2business`
(
  ingest_date DATE,
  business_date DATE,
  ingest_timestamp TIMESTAMP
);

CREATE OR REPLACE TABLE `<dataset>.consume`
(
  trade_id STRING,
  trade_version STRING,
  business_date DATE,
  latest_version BOOL,
  raw_key STRING,
  ingest_date DATE,
  version_desc INT64
)
PARTITION BY business_date
OPTIONS (
  require_partition_filter = true
);

-- Single-row control table holding the load watermark
CREATE OR REPLACE TABLE `<dataset>.consume_ctrl`
(
  partition_ingest_timestamp TIMESTAMP
);
---------------------------------------------------------------------------------------------
-- Some test data
/*
delete from <dataset>.raw where ingest_date > '1899-01-01';
delete from <dataset>.transform where ingest_date > '1899-01-01';
delete from <dataset>.partition_ingest2business where ingest_date > '1899-01-01';
*/
insert into <dataset>.raw select 'raw_key_001', cast('2000-01-01' as date), cast('2000-01-01 00:00:01' as timestamp), 'trade_id_001', 'trade_version_001_01', cast('2000-01-01' as date), cast('2000-01-01 00:00:01' as timestamp);
insert into <dataset>.raw select 'raw_key_002', cast('2000-01-01' as date), cast('2000-01-01 00:00:02' as timestamp), 'trade_id_002', 'trade_version_002_01', cast('2000-01-01' as date), cast('2000-01-01 00:00:02' as timestamp);
insert into <dataset>.raw select 'raw_key_003', cast('2000-01-03' as date), cast('2000-01-03 00:00:01' as timestamp), 'trade_id_003', 'trade_version_003_01', cast('2000-01-03' as date), cast('2000-01-03 00:00:01' as timestamp);
insert into <dataset>.raw select 'raw_key_004', cast('2000-01-03' as date), cast('2000-01-03 00:00:02' as timestamp), 'trade_id_004', 'trade_version_004_01', cast('2000-01-03' as date), cast('2000-01-03 00:00:02' as timestamp);
insert into <dataset>.raw select 'raw_key_005', cast('2000-01-03' as date), cast('2000-01-03 00:00:03' as timestamp), 'trade_id_005', 'trade_version_005_01', cast('2000-01-03' as date), cast('2000-01-03 00:00:03' as timestamp);
insert into <dataset>.raw select 'raw_key_006', cast('2000-01-03' as date), cast('2000-01-03 00:00:04' as timestamp), 'trade_id_006', 'trade_version_006_01', cast('2000-01-03' as date), cast('2000-01-03 00:00:04' as timestamp);
---------------------------------------------------------------------------------------------
-- Incremental load of partition_ingest2business table
declare max_ingest_timestamp timestamp default
(
  select max(ingest_timestamp)
  from <dataset>.partition_ingest2business
);

if max_ingest_timestamp is null then
  -- Destination table is empty, set the partition filter from the source table
  set max_ingest_timestamp = cast('1899-01-01' as timestamp);
end if;

-- Debug output: the watermark, and a preview of the merge source below
select max_ingest_timestamp, cast(max_ingest_timestamp as date);

select
  ingest_date
  ,business_date
  ,max(ingest_timestamp)
from <dataset>.raw
where
  ingest_date >= cast(max_ingest_timestamp as date)
group by
  ingest_date
  ,business_date;
-- Merge into destination table
merge <dataset>.partition_ingest2business trgt
using
(
  select
    ingest_date
    ,business_date
    ,max(ingest_timestamp) as ingest_timestamp
  from <dataset>.raw
  where
    ingest_date >= cast(max_ingest_timestamp as date)
    and ingest_timestamp > max_ingest_timestamp
  group by
    ingest_date
    ,business_date
) src
on trgt.ingest_date = src.ingest_date and trgt.business_date = src.business_date
when not matched then
  insert (ingest_date, business_date, ingest_timestamp) values (src.ingest_date, src.business_date, src.ingest_timestamp)
when matched and trgt.ingest_timestamp != src.ingest_timestamp then
  update set ingest_timestamp = src.ingest_timestamp;
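If I read the script right, the first run against the test data above should leave partition_ingest2business with one row per (ingest_date, business_date) pair, carrying that pair's max ingest_timestamp:

ingest_date | business_date | ingest_timestamp
2000-01-01  | 2000-01-01    | 2000-01-01 00:00:02
2000-01-03  | 2000-01-03    | 2000-01-03 00:00:04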
---------------------------------------------------------------------------------------------
-- Incremental load of consume table with partition overwrite
declare ingest_date_filter array<date>;
declare business_date_filter array<date>;
-- consume_ctrl is expected to hold a single watermark row
declare partition_ingest_timestamp timestamp default
(
  select partition_ingest_timestamp
  from <dataset>.consume_ctrl
);
declare max_ingest_timestamp timestamp default
(
  select max(ingest_timestamp)
  from <dataset>.partition_ingest2business
);

if partition_ingest_timestamp is null then
  set partition_ingest_timestamp = cast('1899-01-01' as timestamp);
  insert into <dataset>.consume_ctrl (partition_ingest_timestamp) values (cast('1899-01-01' as timestamp));
end if;

-- Business dates touched since the last run...
set business_date_filter =
(
  select ARRAY_AGG(distinct business_date)
  from <dataset>.partition_ingest2business
  where
    ingest_timestamp > partition_ingest_timestamp
    and ingest_timestamp <= max_ingest_timestamp
);
-- ...and every ingest date that contributes rows to those business dates
set ingest_date_filter =
(
  select ARRAY_AGG(distinct ingest_date)
  from <dataset>.partition_ingest2business
  where business_date in UNNEST(business_date_filter)
);
if ARRAY_LENGTH(business_date_filter) > 0 then
  -- 'on false' means no row ever matches: every source row is inserted and,
  -- via 'not matched by source', every existing row in the touched
  -- business_date partitions is deleted, i.e. a partition overwrite
  merge <dataset>.consume trgt using
  (
    select
      trade_id
      ,trade_version
      ,business_date
      ,row_number() over (partition by trade_id order by trade_version desc, business_timestamp desc, ingest_timestamp desc) as version_desc
      ,raw_key
      ,ingest_date
    from <dataset>.raw
    where
      ingest_date in unnest(ingest_date_filter)
      and ingest_timestamp <= max_ingest_timestamp
  ) src
  on false
  when not matched and business_date in unnest(business_date_filter) then
    insert (trade_id, trade_version, business_date, version_desc, raw_key, ingest_date) values (trade_id, trade_version, business_date, version_desc, raw_key, ingest_date)
  when not matched by source and business_date in unnest(business_date_filter) then
    delete;
  -- Update consume_ctrl with max_ingest_timestamp to be used next execution
  update <dataset>.consume_ctrl set partition_ingest_timestamp = max_ingest_timestamp where 1 = 1;
end if;
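For context, a typical downstream query would then prune on the business_date partitions and filter to the latest versions. Note that the merge above populates version_desc but never sets latest_version, so the filter that actually works with this script is version_desc = 1:

-- Example downstream query (a partition filter is mandatory on consume)
select trade_id, trade_version, business_date
from <dataset>.consume
where business_date between date '2000-01-01' and date '2000-01-31'
  and version_desc = 1;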

You haven't asked a specific question, but here are some responses...
As I think you are aware, you're using standard column-based partitioning, not ingestion-time partitioning, which would look like this...
PARTITION BY _PARTITIONDATE
I am a bit wary of row-level operations like MERGE on big fact tables in BigQuery, although performance improvements have been released in recent months. Analytical columnar databases generally excel at append rather than merge use cases; a sketch of an append-only alternative follows below.
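To make the append-only idea concrete, here is a minimal, untested sketch: load consume insert-only and resolve the latest version at read time in a view. It assumes consume also carries ingest_timestamp; the view name is illustrative:

-- Hypothetical append-only pattern: consume stays insert-only, a view dedupes
CREATE OR REPLACE VIEW `<dataset>.consume_latest` AS
SELECT * EXCEPT(rn)
FROM (
  SELECT
    c.*,
    ROW_NUMBER() OVER (
      PARTITION BY trade_id
      ORDER BY trade_version DESC, ingest_timestamp DESC
    ) AS rn
  FROM `<dataset>.consume` c
)
WHERE rn = 1;

Queries against the view would still need a business_date predicate because of require_partition_filter on the underlying table.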
Due to current limitations of BigQuery materialized views (the partitioning must be the same as the underlying table), you do indeed need separate tables if you wish to have both PARTITION BY ingest_date and PARTITION BY business_date tables available.
For .consume you could add clustering; it has performance and on-demand cost benefits, sometimes massive query cost reductions depending on the cardinality of the chosen columns:
PARTITION BY business_date
CLUSTER BY latest_version
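Spelled out as full DDL (a sketch: the separate table name and the second clustering column are suggestions to tune against your query patterns, not part of the original answer):

CREATE OR REPLACE TABLE `<dataset>.consume_clustered`
(
  trade_id STRING,
  trade_version STRING,
  business_date DATE,
  latest_version BOOL,
  raw_key STRING,
  ingest_date DATE,
  version_desc INT64
)
PARTITION BY business_date
CLUSTER BY latest_version, trade_id
OPTIONS (
  require_partition_filter = true
);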
I'm more familiar with Dataform (than dbt), which does not automate complex transformations except where packages are provided and/or you write your own in JavaScript. Here is a link explaining how to solve a common complex transformation problem, SCDs, by utilising the SCD package provided by Dataform:
https://docs.dataform.co/packages/dataform-scd
Dataform is excellent for refactoring/documentation/assertions/testing/deployment of SQL transforms (using SQLX). New Dataform Cloud users have been put on a waitlist since May 12 after the acquisition by GCP, but you can still install Dataform and use the Dataform CLI.
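For flavour, an incremental model in Dataform SQLX might look roughly like this. A minimal, untested sketch: the incremental config and the when()/self() helpers are real Dataform features, but the watermark logic on ingest_date is my assumption, not a drop-in port of your script:

config {
  type: "incremental",
  bigquery: {
    partitionBy: "business_date"
  }
}

select
  trade_id,
  trade_version,
  business_date,
  raw_key,
  ingest_date
from ${ref("raw")}
${when(incremental(), `where ingest_date > (select max(ingest_date) from ${self()})`)}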

Related

Making a foreign key reference only a subset of rows in the referenced table

I'm using MS SQL SERVER 2019.
In my database I have a table Events which stores events offered by a club
Events (Theme, Event_Date, Place, Event_Hour, Type)
Then I also have a table Reservations which stores reservations for those events:
Reservations (Id, Event_Theme) FK(Event_Theme) --> PK(Events.Theme)
My goal is to allow insertion of new rows in Reservations only if Event_Theme for that row is the theme of a future event (i.e. one with Event_Date > CURRENT DATE): obviously reservations are not allowed for past events.
My attempt was the creation, inside Reservations table's creation, of the following check constraint:
CONSTRAINT CHK_Reservations_Event_Theme CHECK (Event_Theme IN (SELECT STRING_AGG(E.Theme, ',') WITHIN GROUP (ORDER BY E.Theme ASC)
FROM Events E
WHERE E.Event_Date>=(CAST( GETDATE() AS Date))))
But then I remembered that subqueries aren't supported in check constraints.
What is an alternative way to implement the same logic?
As you can see in the example, event number 2 is outdated, so its row is not inserted, while event number 1 can be inserted.
You can add more and more conditions to the WHERE, for example if only 100 reservations may be made per event (see the sketch after the fiddle link below).
CREATE TABLE Events (Theme int, Event_Date date, Place varchar(4), Event_Hour int, Type int)
INSERT INTO Events VALUES(1, DATEADD(month, 1, getdate()), 'A', 1, 1),
(2, DATEADD(day, -1, getdate()), 'B', 2, 2)
CREATE TABLE Reservations (Id int, Event_Theme int)
CREATE TRIGGER tr_Reservations ON Reservations
INSTEAD OF INSERT
AS
BEGIN
  -- Only pass through rows whose theme belongs to a future event
  INSERT INTO [Reservations](
    [Id],
    [Event_Theme])
  SELECT Id, [Event_Theme]
  FROM INSERTED
  WHERE Event_Theme IN (SELECT E.Theme
                        FROM [Events] E
                        WHERE E.Event_Date >= (CAST(GETDATE() AS Date)));
  PRINT 'success.';
END
INSERT INTO Reservations VALUES (1,1),(2,2)
success.
3 rows affected
SELECT * FROM Reservations
Id | Event_Theme
-: | ----------:
1 | 1
db<>fiddle here
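To sketch the "more conditions" idea with a cap of 100 reservations per event (the threshold and the predicate are illustrative, not from the fiddle), the SELECT inside the trigger body could become:

SELECT I.Id, I.[Event_Theme]
FROM INSERTED I
WHERE I.Event_Theme IN (SELECT E.Theme
                        FROM [Events] E
                        WHERE E.Event_Date >= CAST(GETDATE() AS Date))
  AND (SELECT COUNT(*)
       FROM [Reservations] R
       WHERE R.Event_Theme = I.Event_Theme) < 100;

Note this counts only already-committed reservations, so rows arriving in the same multi-row INSERT are not counted against each other.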
I also found a way without using a trigger.
I created the following scalar function:
CREATE FUNCTION usf_CHECK_THEME(@EventTheme VARCHAR(45))
RETURNS VARCHAR(45)
AS
BEGIN
  -- Returns the theme if it belongs to a future event, otherwise NULL
  DECLARE @Theme VARCHAR(45);
  SET @Theme = (SELECT E.Theme
                FROM Events E
                WHERE E.Theme = @EventTheme AND E.Event_Date > (CAST(GETDATE() AS Date)));
  RETURN @Theme;
END;
Theme is the PK in table Events.
And then in the check constraint (note the schema-qualified call, which scalar UDFs require):
CONSTRAINT CHK_Reservations_Event_Theme CHECK (dbo.usf_CHECK_THEME(Event_Theme) IS NOT NULL)
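For completeness, attaching that constraint to an existing table would look something like this (a sketch; it assumes the function was created in the dbo schema):

ALTER TABLE Reservations
ADD CONSTRAINT CHK_Reservations_Event_Theme
CHECK (dbo.usf_CHECK_THEME(Event_Theme) IS NOT NULL);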

Search for closest date on SQL table

I have a hypothetical SQL table "EVENTS" with two columns: a UUID index column and a DateTime column.
The table is populated with values ranging from 1900-01-01 to today; it is not ordered, and there are numerous dates missing.
The query that I have to run is basically 'retrieve all events that happened at the requested date (start to the end of the day) or, failing that, the closest previous date'.
If I were looking for all events in a day that I know that exists in the database it would be something as simple as:
SELECT * FROM Events e
WHERE
e.date BETWEEN $START_OF_DAY AND $END_OF_DAY;
But if that date doesn't exist I must retrieve the latest date up to the requested date.
Grabs the current day but, if no records are found, returns all records from the nearest previous day that has records.
So in my sample data, Jan 2 returns the 3 events dated Jan 1.
SQL Server Solution
DECLARE @Input DATE = '2022-01-02' /*Try Jan 1, 2, 3, or 4*/
DROP TABLE IF EXISTS #Event
CREATE TABLE #Event (ID INT IDENTITY(1,1), EventDateTime DATETIME)
INSERT INTO #Event
VALUES
('2022-01-01 08:00')
,('2022-01-01 09:00')
,('2022-01-01 10:00')
,('2022-01-03 12:00')
SELECT TOP (1) WITH TIES *
FROM #Event AS A
CROSS APPLY (SELECT EventDate = CAST(EventDateTime AS DATE)) AS B
WHERE B.EventDate <= @Input
ORDER BY B.EventDate DESC
SQL Fiddle wasn't letting me create a variable, but here's the code, conceptually, for a more efficient version for MySQL. It grabs the desired date range in the first query, then uses it to filter in the second query. I think it should perform far better than the accepted answer, assuming you have an index on EventDateTime.
CREATE TABLE Event (
ID MEDIUMINT NOT NULL AUTO_INCREMENT
,EventDateTime DATETIME
,PRIMARY KEY (ID));
INSERT INTO Event (EventDateTime)
VALUES
('2022-01-01 08:00')
,('2022-01-01 09:00')
,('2022-01-01 10:00')
,('2022-01-03 12:00');
/*Need to save these off to variables to use in later query*/
SELECT TIMESTAMP(CAST(EventDateTime AS DATE)) AS StartRange
,TIMESTAMP(CAST(EventDateTime AS DATE)) + INTERVAL 1 DAY AS EndRange
FROM Event
WHERE EventDateTime < DATE_ADD('2022-01-04' /*Input*/,INTERVAL 1 DAY)
ORDER BY EventDateTime DESC
LIMIT 1;
SELECT *
FROM Event
WHERE EventDateTime >= StartRange
AND EventDateTime < EndRange
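One hedged way to wire the two queries together without an application round trip is MySQL session variables (a sketch; the variable names are mine):

/* Capture the range in session variables, then filter with them */
SELECT TIMESTAMP(CAST(EventDateTime AS DATE)),
       TIMESTAMP(CAST(EventDateTime AS DATE)) + INTERVAL 1 DAY
INTO @StartRange, @EndRange
FROM Event
WHERE EventDateTime < DATE_ADD('2022-01-04' /*Input*/, INTERVAL 1 DAY)
ORDER BY EventDateTime DESC
LIMIT 1;

SELECT *
FROM Event
WHERE EventDateTime >= @StartRange
  AND EventDateTime < @EndRange;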
Calculate the most recent date and do a self join. Although I'm using MySQL, I believe this is the most generic workaround.
CREATE TABLE d0207Event (ID INT ,EventDateTime DATETIME)
INSERT INTO d0207Event
VALUES
(1,'2022-01-01 08:00')
,(2,'2022-01-01 09:00')
,(3,'2022-01-01 10:00')
,(4,'2022-01-03 12:00')
INSERT INTO d0207Event
VALUES
(5, '2021-12-12 08:00');
select t1.*
from d0207Event t1,
(
select min(t1.dat) mindat
from (
select t1.*,
DATEDIFF('2022-01-02', cast(t1.EventDateTime as date)) dat
from d0207Event t1
) t1
where t1.dat >= 0
) t2
where DATEDIFF('2022-01-02', cast(t1.EventDateTime as date)) = t2.mindat
;
There are also more advanced constructs that can solve this problem better, depending on which DB you use and your specific application scenario.
If your database supports analytic (window) functions, they usually solve the efficiency problem well, since the Event table only needs to be scanned once.
CREATE TABLE Event (
ID MEDIUMINT NOT NULL AUTO_INCREMENT
,EventDateTime DATETIME
,PRIMARY KEY (ID));
INSERT INTO Event (EventDateTime)
VALUES
('2022-01-01 08:00')
,('2022-01-01 09:00')
,('2022-01-01 10:00')
,('2022-01-03 12:00');
select *
from (
select t1.*,
first_value(cast(t1.EventDateTime as date))
over(order by cast(t1.EventDateTime as date) desc) fv
from event t1
where cast(t1.EventDateTime as date) <= '2022-01-03'
) t1
where cast(t1.EventDateTime as date) = fv
Creating a functional index on cast(EventDateTime as date), or creating a virtual column directly, can make the query simpler; otherwise, keeping the raw column on one side of the comparison and applying date_add() to the constants is a good way to stay index-friendly.
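For example, in MySQL 8.0 either of these would let the date comparison use an index (a sketch; the index and column names are mine):

/* Functional index over the cast (MySQL 8.0.13+) */
CREATE INDEX idx_event_date ON Event ((CAST(EventDateTime AS DATE)));

/* Or a stored generated column with a regular index */
ALTER TABLE Event
  ADD COLUMN EventDate DATE GENERATED ALWAYS AS (CAST(EventDateTime AS DATE)) STORED,
  ADD INDEX idx_event_date2 (EventDate);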

SCD Type 2 - Handling Intraday changes?

I have a merge statement that builds my SCD type 2 table each night. This table must house all historical changes made in the source system and create a new row with the date-from/date-to columns populated, along with the "IsLatest" flag. I have come across an issue today that I am not really sure how to handle:
there have been multiple changes to the source table within a 24-hour period.

ID    | Code          | PAN  | EnterDate  | Cost   | Created
----- | ------------- | ---- | ---------- | ------ | ----------------
16155 | 1012401593331 | ENRD | 2015-11-05 | 7706.3 | 2021-08-17 14:34
16155 | 1012401593331 | ENRD | 2015-11-05 | 8584.4 | 2021-08-17 16:33

I use a basic merge statement to identify my changes; however, what would be the best approach to ensure all changes get picked up correctly? The above gives me an error because it tries to insert/update multiple rows with the same key.
DECLARE @DateNow DATETIME = GETDATE()

IF OBJECT_ID('tempdb..#meteridinsert') IS NOT NULL
  DROP TABLE #meteridinsert;

CREATE TABLE #meteridinsert
(
  meterid INT,
  change VARCHAR(10)
);

-- Close the current row for every changed meter; insert brand-new meters
MERGE INTO [DIM].[Meters] AS target
USING stg_meters AS source
ON target.[ID] = source.[ID]
AND target.islatest = 1
WHEN MATCHED THEN
  UPDATE
  SET target.islatest = 0,
      target.todate = @DateNow
WHEN NOT MATCHED BY TARGET THEN
  INSERT
  (
    id, code, pan, enterdate, cost, created, [FromDate], [ToDate], [IsLatest]
  )
  VALUES
  (
    source.id, source.code, source.pan, source.enterdate, source.cost, source.created, @DateNow, NULL, 1
  )
OUTPUT source.id, $action
INTO #meteridinsert;

-- Re-insert the new 'latest' row for every meter whose old row was just closed
INSERT INTO [DIM].[Meters]
(
  [id], [code], [pan], [enterdate], [cost], [created], [FromDate], [ToDate], [IsLatest]
)
SELECT a.[id], a.[code], a.[pan], a.[enterdate], a.[cost], a.[created], @DateNow, NULL, 1
FROM stg_meters a
INNER JOIN #meteridinsert cid
ON a.id = cid.meterid
AND cid.change = 'UPDATE'
Maybe you can do it using a merge statement, but I would prefer the typical update-and-insert approach, to make it easier to understand (also, I am not sure merge lets you use the same source record for both an update and an insert...).
First of all I create the table dimscd2 to represent your dimension table
create table dimscd2
(naturalkey int, descr varchar(100), startdate datetime, enddate datetime)
And then I insert some records...
insert into dimscd2 values
(1,'A','2019-01-12 00:00:00.000', '2020-01-01 00:00:00.000'),
(1,'B','2020-01-01 00:00:00.000', NULL)
As you can see, the "current" record is the one with descr='B' because it has a NULL enddate. (I do recommend using a surrogate key for each record: just an incremental key for each row of your dimension; the fact table must be linked to this surrogate key in order to reflect the status of the fact at the moment it happened.)
Then, I have created some dummy data to represent the source data with the changes for the same natural key
-- new data (src_data)
select 1 as naturalkey,'C' as descr, cast('2020-01-02 00:00:00.000' as datetime) as dt into src_data
union all
select 1 as naturalkey,'D' as descr, cast('2020-01-03 00:00:00.000' as datetime) as dt
After that, I have created a temp table (##tmp) with this query to set the enddate for each record:
-- tmp table
select naturalkey, descr, dt,
lead(dt,1,0) over (partition by naturalkey order by dt) enddate,
row_number() over (partition by naturalkey order by dt) rn
into ##tmp
from src_data
The LEAD function takes the next start date for the same natural key, ordered by date (dt).
The ROW_NUMBER marks with 1 the oldest record in the source data for the natural key in the dimension.
Then, I proceed to close the "current" record using update
update d
set enddate = t.dt
from dimscd2 d
join ##tmp t
on d.naturalkey = t.naturalkey
and d.enddate is null
and t.rn = 1
And finally I add the new source data to the dimension with insert
insert into dimscd2
select naturalkey, descr, dt,
-- lead() defaulted to 0, which casts to '1900-01-01'; map it back to NULL
case enddate when '1900-01-01' then null else enddate end
from ##tmp
Final result is obtained with the query:
select * from dimscd2
You can test on this db<>fiddle

Search By Time (only) in DateTime SQL column

I am working with SQL Server and PHP, using stored procedures.
I have a table called myTable. It has a column start_time (DateTime format).
start_time
-----------------------
2019-05-23 12:20:22.000
2019-08-02 01:21:02.000
2019-02-10 22:32:17.000
2019-08-14 04:56:24.000
I want to filter results by time only.
For example: BETWEEN 22:20:10.000 AND 04:56:24.000.
But it's not working.
Simple casting to time datatype will work:
select * from myTable
where cast(start_time as time) >= '22:00:00.000'
or cast(start_time as time) <= '04:00:00.000'
Note that applying a CAST function to the start_time column in the WHERE clause predicate will prevent an index on that column from being used efficiently. A full table scan will be required unless other criteria are specified.
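One hedged way around that is to persist the time-of-day as a computed column and index it (a sketch; the column and index names are mine):

-- Computed column for the time part, plus an index on it
ALTER TABLE myTable ADD start_time_only AS CAST(start_time AS TIME) PERSISTED;
CREATE INDEX IX_myTable_start_time_only ON myTable (start_time_only);

SELECT * FROM myTable
WHERE start_time_only >= '22:00:00'
OR start_time_only <= '04:00:00';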
This code will also work, please check:
create table #temp
(
[Date] datetime
)
insert into #temp values ('2019-05-23 12:20:22.000')
insert into #temp values ('2019-08-02 01:21:02.000')
insert into #temp values ('2019-02-10 22:32:17.000')
insert into #temp values ('2019-08-14 04:56:24.000')
select cast([Date] as date) as [Date],convert(char(15), [Date], 108) [Time]
from #temp
where convert(char(15), [Date], 108) between '04:56:24' and '22:32:17'
Drop table #temp

SQL Query > sysdate < sysdate + 1 year

I am trying to write a query that will find the max expiration date, but I noticed that I get no results if a part has an expiration date of, say, 30-Dec-16 while the same part also has an expiration date of 01-Jan-2099 (which is the default date if nothing is filled in). Below is my query; how could I rewrite the expiration_date subquery to get the correct date?
SELECT
  Part,
  price,
  effective_date,
  expiration_date
FROM a.Table
WHERE Part IN ('&Part')
AND PRICE IN ('somewere')
AND expiration_date IN (SELECT MAX(expiration_date)
                        FROM table
                        WHERE expiration_date > SYSDATE
                        AND part IN ('&Part')
                        AND PRICE IN ('somewere'))
AND to_date(effective_date) IN (SELECT MAX(EFFECTIVE_DATE)
                                FROM b.table
                                WHERE expiration_date > SYSDATE
                                AND Part IN ('&Part')
                                AND price IN ('somewere')
                                AND EFFECTIVE_DATE < SYSDATE + 1)
I would use ROW_NUMBER. https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions137.htm
Here is the query:
SELECT
  part
  ,price
  ,effective_date
  ,expiration_date
FROM (
  SELECT
    part
    ,price
    ,effective_date
    ,expiration_date
    ,ROW_NUMBER() OVER (PARTITION BY part ORDER BY expiration_date DESC) AS "row"
  FROM @tbl
  WHERE effective_date < SYSDATE + 1
) tbl
WHERE "row" = 1
Here is what I used to populate @tbl.
DECLARE @tbl TABLE (
  part NVARCHAR(MAX)
  ,price FLOAT
  ,effective_date DATETIME2(3)
  ,expiration_date DATETIME2(3)
)
INSERT @tbl (part, price, effective_date, expiration_date)
VALUES ('Apples', 7.95, '2016-12-01', '2016-12-30')
,('Apples', 7.95, '2016-11-01', '2016-11-30')
,('Apples', 7.95, '2016-12-30', '2099-01-01')