How to handle late arriving dimension and NULL business key in Kimball Data Mart?

jrara picture jrara · Oct 12, 2012 · Viewed 7.7k times · Source

I'm trying to implement Kimball data mart which uses -1 and -2 rows in dimension table for late arriving dimension and for NULL business key. I have an example code below which creates one staging table for fact and dimension data, two dimension tables and one fact table for data mart. Here is my example code with data in SQL:

--drop table stg_sales
--go
CREATE TABLE dbo.stg_sales
  (
     stg_sales_id       INT IDENTITY(1, 1) NOT NULL,
     sales_number       INT NOT NULL,
     sales_amt          INT NULL,
     cust_number        INT NULL,
     cust_firstname     NVARCHAR(50) NULL,
     cust_lastname      NVARCHAR(100) NULL,
     cust_address       NVARCHAR(500) NULL,
     salesperson_number INT NULL,
     CONSTRAINT pk_stg_sales PRIMARY KEY (stg_sales_id)
  )

go

INSERT stg_sales
       (sales_number,
        sales_amt,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        salesperson_number)
VALUES (123,
        434,
        2342,
        'Jim',
        'Moriaty',
        'something',
        23)

INSERT stg_sales
       (sales_number,
        sales_amt,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        salesperson_number)
VALUES (124,
        234,
        2342,
        'Jim',
        'Moriaty',
        'something',
        23)

INSERT stg_sales
       (sales_number,
        sales_amt,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        salesperson_number)
VALUES (125,
        434,
        4545,
        'Joe',
        'Esk',
        'someother',
        24)

INSERT stg_sales
       (sales_number,
        sales_amt,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        salesperson_number)
VALUES (126,
        434,
        5555,
        'Daniel',
        'Hart',
        'Someaddr',
        NULL) --salesperson_number business key missing here

INSERT stg_sales
       (sales_number,
        sales_amt,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        salesperson_number)
VALUES (127,
        333,
        4444,
        'Pat',
        'Smith',
        'Someaddr',
        30)

SELECT *
FROM   stg_sales

--create a dimension and fact tables
--drop table dbo.dim_customer
--go
CREATE TABLE dbo.dim_customer
  (
     customer_wid   INT IDENTITY(1, 1) NOT NULL,
     cust_number    INT NULL,
     cust_firstname NVARCHAR(50) NULL,
     cust_lastname  NVARCHAR(100) NULL,
     cust_address   NVARCHAR(500) NULL,
     date_insert    DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update    DATETIME2 NULL,
     is_current     BIT NOT NULL
     CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid)
     CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
  )

go

SET IDENTITY_INSERT dbo.dim_customer ON

INSERT dbo.dim_customer
       (customer_wid,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        date_insert,
        date_update,
        is_current)
VALUES (-1,
        -1,
        'unknown',
        'unknown',
        'unknown',
        Getdate(),
        Getdate(),
        1)

INSERT dbo.dim_customer
       (customer_wid,
        cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        date_insert,
        date_update,
        is_current)
VALUES (-2,
        -2,
        'Error',
        'Error',
        'Error',
        Getdate(),
        Getdate(),
        1)

SET IDENTITY_INSERT dbo.dim_customer OFF

--insert data into dimension table
INSERT dbo.dim_customer
       (cust_number,
        cust_firstname,
        cust_lastname,
        cust_address,
        is_current)
SELECT DISTINCT cust_number,
                cust_firstname,
                cust_lastname,
                cust_address,
                1 AS is_current
FROM   dbo.stg_sales
WHERE  cust_number <> 4444 --left one record off to simulate the situation where you don't have corrensponding row in dim table (late arriving dimension)
SELECT *
FROM   dbo.dim_customer

DROP TABLE dbo.dim_salesperson

--create salesperson table
CREATE TABLE dbo.dim_salesperson
  (
     salesperson_wid       INT IDENTITY(1, 1) NOT NULL,
     salesperson_number    INT NULL,
     salesperson_firstname NVARCHAR(50) NULL,
     salesperson_lastname  NVARCHAR(100) NULL,
     salesperson_address   NVARCHAR(500) NULL,
     date_insert           DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update           DATETIME2 NULL,
     is_current            BIT NOT NULL
     CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid)
     CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
  )

go

SET IDENTITY_INSERT dbo.dim_salesperson ON

INSERT dbo.dim_salesperson
       (salesperson_wid,
        salesperson_number,
        salesperson_firstname,
        salesperson_lastname,
        salesperson_address,
        date_insert,
        date_update,
        is_current)
VALUES (-1,
        -1,
        'Not available',
        'Not available',
        'Not available',
        Getdate(),
        Getdate(),
        1)

INSERT dbo.dim_salesperson
       (salesperson_wid,
        salesperson_number,
        salesperson_firstname,
        salesperson_lastname,
        salesperson_address,
        date_insert,
        date_update,
        is_current)
VALUES (-2,
        -2,
        'Error',
        'Error',
        'Error',
        Getdate(),
        Getdate(),
        1)

SET IDENTITY_INSERT dbo.dim_salesperson OFF

--insert data into salesperson
INSERT dbo.dim_salesperson
       (salesperson_number,
        salesperson_firstname,
        salesperson_lastname,
        salesperson_address,
        is_current)
VALUES (23,
        'John',
        'Fox',
        'something',
        1)

INSERT dbo.dim_salesperson
       (salesperson_number,
        salesperson_firstname,
        salesperson_lastname,
        salesperson_address,
        is_current)
VALUES (24,
        'Hadley',
        'Fox',
        'something',
        1)

INSERT dbo.dim_salesperson
       (salesperson_number,
        salesperson_firstname,
        salesperson_lastname,
        salesperson_address,
        is_current)
VALUES (30,
        'Ashley',
        'Fox',
        'something',
        1)

SELECT *
FROM   dbo.dim_salesperson

SELECT *
FROM   dbo.stg_sales

--create and populate the fact table
--drop table dbo.f_sales
--go
CREATE TABLE dbo.f_sales
  (
     sales_number    INT NOT NULL,
     customer_wid    INT NOT NULL,
     salesperson_wid INT NOT NULL,
     sales_amt       INT NULL
     CONSTRAINT pk_f_sales PRIMARY KEY (sales_number)
     CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
     dbo.dim_customer(customer_wid),
     CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
     dbo.dim_salesperson(salesperson_wid)
  )

--populate the fact table 
INSERT dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       Isnull(dimcust.customer_wid, -1)  AS customer_wid,
       --this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
       Isnull(dimsp.salesperson_wid, -2) AS salesperson_wid,
       --NOT CORRECT, how to assign -2 foreign key when the business key is NULL in the source?
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON stg.cust_number = dimcust.cust_number
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON stg.salesperson_number = dimsp.salesperson_number

SELECT *
FROM   dbo.f_sales

How can I assign -2 for the rows in which the business key is missing in the source system. You can read more about the theory behind this implementation from Kimball:

This is basically what I'm trying to achieve:

Handling NULL in dimensions and measures

Edit:

I think I can use a COALESCE or ISNULL in the left join, it seems to produce the correct result:

INSERT dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       Isnull(dimcust.customer_wid, -1)  AS customer_wid,
       --this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
       dimsp.salesperson_wid,
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number

Answer

Damir Sudarevic picture Damir Sudarevic · Oct 12, 2012

Purely as a lookup technique

-- add nullable keys to the staging table
alter table dbo.stg_sales ADD
  sales_person_wid integer null
, customer_wid     integer null
;

-- insert to staging table here (as in your example)

-- lookup sales person key
update dbo.stg_sales 
 set sales_person_wid = p.sales_person_wid
from dbo.stg_sales as s , dbo.dim_salesperson as p
where s.salesperson_number =  p.salesperson_number ;

-- decide what to do with missing business keys
update dbo.stg_sales 
 set sales_person_wid = -2
where sales_person_wid is null ;


-- do similar for customer

-- now all keys in staging table are not null

-- load to fact table

However, usual technique is to assign special (unknown, n/a, error) business keys to transactions during extraction or maybe cleaning. In other words that special Error business key could have been assigned before the record landed into the staging table.

And in final note, late arriving dimension means that business key (salesperson_number) is known to the operational system, but the transaction (sale fact) made it to warehouse before the dimension's data. So, salesperson_number would be not null, but would not exist in the dimenson table. You than have to save this transaction somewhere and try to update the FK (salesperson_wid) once the record arrives in the dimension; a day later or so.