维度模型数据仓库(五)- 需求增加列

浏览: 2239

 数据仓库中经常会遇到事实表以及维度表需求变更,最常见的就是增加一列数据。本篇先讨论如何需要增加一列数据,模式会发生怎样的改变。然后就销售事实表以及客户维度表分别添加一列,并在该列上运用SCD2的缓慢变化维度。

 修改数据库模式

        图(五)- 1-1 显示了修改后的模式,在它的customer_dim表和sales_order_fact表上增加了新列。customer_dim表增加的新列是shipping_address、shipping_zip_code、shipping_city和shipping_state。sales_order_fact表增加的新列是order_quantity。使用清单(五)-1-1里的SQL脚本修改数据库模式。

(此图片引用自blog)

图(五)- 1-1


use dw;
alter table dbo.Customer_dim add shipping_address varchar(50);
alter table dbo.Customer_dim add shipping_zip_code int;
alter table dbo.Customer_dim add shipping_city varchar(30);
alter table dbo.Customer_dim add shipping_state varchar(2);
alter table dbo.customer_stg add  shipping_address varchar(50);
alter table dbo.customer_stg add shipping_zip_code int;
alter table dbo.customer_stg add shipping_city varchar(30);
alter table dbo.customer_stg add shipping_state varchar(2);
alter table dbo.Sales_order_fact add  order_quantity int;
go
use source;
alter table dbo.Customer add shipping_address varchar(50);
alter table dbo.Customer add shipping_zip_code int;
alter table dbo.Customer add shipping_city varchar(30);
alter table dbo.Customer add shipping_state varchar(2);
alter table dbo.Sales_Order add order_quantity int;
go


 修改表模式之后,需修改初始化加载脚本以及定期加载脚本。但由于初始化数据已经加载进数据库,因此此处只修改定期加载脚本。


set ANSI_NULLS ON
set QUOTED_IDENTIFIER ON
go

ALTER procedure [dbo].[test] as
--定期加载数据
UPDATE cdc_time SET current_load = getdate() ;
declare @eff_date datetime;
select @eff_date=current_load from dbo.cdc_time;

begin

truncate table dbo.customer_stg;
truncate table dbo.product_stg;
--客户以及产品表整体拉取(客户表address实现scd2,name实现scd1
insert into dbo.customer_stg
(customer_number
,customer_name
,customer_street_address
,customer_zip_code
,customer_city
,customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state)
select Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
from source.dbo.Customer;


--新增数据
insert into dbo.Customer_dim
(Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,effective_date
,expiry_date
,version)
select Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,
@eff_date
,
convert(datetime,'2099-12-31',120)
,
1 from dbo.customer_stg where customer_number not in (
SELECT customer_number
FROM customer_dim x);
--更新数据处理
update customer_dim
set expiry_date=@eff_date
where exists (select 1 from customer_dim a,customer_stg b where a.customer_number=b.customer_number
and (a.Customer_street_address<>b.Customer_street_address
or a.shipping_address<>b.shipping_address
or a.shipping_zip_code<>b.shipping_zip_code
or a.shipping_city<>b.shipping_city
or a.shipping_state<>b.shipping_state)
AND expiry_date = convert(datetime,'2099-12-31',120)) ;

insert into customer_dim
(Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,effective_date
,expiry_date
,version)
select b.Customer_number
,b.Customer_name
,b.Customer_street_address
,b.Customer_zip_code
,b.Customer_city
,b.Customer_state
,b.shipping_address
,b.shipping_zip_code
,b.shipping_city
,b.shipping_state
,
@eff_date
,
convert(datetime,'2099-12-31',120)
,a.version
+1
from customer_dim a ,customer_stg b
where a.customer_number=b.customer_number
and (a.Customer_street_address<>b.Customer_street_address
or a.shipping_address<>b.shipping_address
or a.shipping_zip_code<>b.shipping_zip_code
or a.shipping_city<>b.shipping_city
or a.shipping_state<>b.shipping_state)
AND EXISTS(
SELECT *
FROM customer_dim x
WHERE
b.customer_number
=x.customer_number
AND a.expiry_date = @eff_date)
AND NOT EXISTS (
SELECT *
FROM customer_dim y
WHERE
b.customer_number
= y.customer_number
AND y.expiry_date = convert(datetime,'2099-12-31',120)) ;


insert into dbo.product_stg
(Product_code
,product_name
,product_category)
select Product_code,Product_name,Product_category from source.dbo.Product;

--设置scd的截止时间以及生效时间

--新产品
insert into dbo.Product_dim
(Product_code
,product_name
,product_category
,effective_date
,expire_date
,version)
select Product_code,Product_name,Product_category
,
@eff_date
,
convert(datetime,'2099-12-31',120)
,
1 from dbo.product_stg where Product_code not in (select
Product_code
from dbo.Product_dim);

--产品表更新
update dbo.Product_dim
set expire_date=@eff_date
where exists(
select 1 from dbo.Product_dim a,dbo.product_stg b where
a.Product_code
=b.Product_code
and (a.product_name<>b.product_name
or a.product_category<>b.product_category)
and a.expire_date=convert(datetime,'2099-12-31',120));

insert into dbo.Product_dim
(Product_code
,product_name
,product_category
,effective_date
,expire_date
,version)
select b.product_code
, b.product_name
, b.product_category
,
@eff_date
,
'2099-12-31'
, a.version
+ 1
FROM
product_dim a
, product_stg b
WHERE
a.product_code
= b.product_code
AND ( a.product_name <> b.product_name
OR a.product_category <> b.product_category)
AND EXISTS(
SELECT *
FROM product_dim x
WHERE b.product_code = x.product_code
AND a.expire_date = @eff_date)
AND NOT EXISTS (
SELECT *
FROM product_dim y
WHERE b.product_code = y.product_code
AND y.expire_date = '2099-12-31') ;
--新数据


--新增的订单
--
装载订单维度,新增前一天的订单号
INSERT INTO order_dim (
order_number
, effective_date
, expiry_date)
SELECT
order_number
, order_date
,
'2200-01-01'
FROM source.sales_order, cdc_time
WHERE entry_date >= last_load AND entry_date < current_load ;

-- 装载事实表,新增前一天的订单
INSERT INTO sales_order_fact
SELECT
order_sk
, customer_sk
, product_sk
, date_sk
, order_amount
, order_quantity
FROM
source.sales_order a
, order_dim b
, customer_dim c
, product_dim d
, date_dim e
, cdc_time f
WHERE
a.order_number
= b.order_number
AND a.customer_number = c.customer_number
AND a.order_date >= c.effective_date
AND a.order_date < c.expiry_date
AND a.product_code = d.product_code
AND a.order_date >= d.effective_date
AND a.order_date < d.expiry_date
AND a.order_date = e.date
AND a.entry_date >= f.last_load AND a.entry_date < f.current_load ;

-- 更新时间戳表的last_load字段
UPDATE cdc_time SET last_load = current_load ;

end;


Kettle解决方案:

 

Kettle 需要更改的步骤:

   装载中间表

  装载维度表(客户维度修改)

 装载事实表

 

修改后的配置如下:

 

 

 

 

修改后数据:

 order 维度表加载数据:

37 NULL NULL NULL 1
38 22 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
39 23 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
40 24 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
41 25 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
42 26 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
43 27 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
44 28 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
45 30 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
46 31 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
47 32 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
48 33 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
49 34 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
50 35 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
51 36 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1

产品维度表:

62 NULL NULL NULL NULL NULL 1
63 1 Hard Disk Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
64 2 Floppy Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
65 3 LCD Panel Monitor 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
66 4 Keyboard Peripheral 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
 

客户维度表:

249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL

 

事实表数据:

250 63 5564 38 1000 1000
251 64 5564 39 2000 2000
252 65 5564 40 3000 3000
253 66 5564 41 4000 4000
254 64 5564 42 1000 1000
255 64 5564 43 3000 3000
256 65 5564 44 5000 5000
250 63 5564 45 1000 1000
251 64 5564 46 2000 2000
252 65 5564 47 4000 4000
253 66 5564 48 6000 6000
254 63 5564 49 2500 2500
255 64 5564 50 5000 5000
256 65 5564 51 7500 7500

 

 执行如下脚本测试:

use source;
update dbo.Customer set shipping_city='shanghai' where Customer_number=1;

客户维度表数据变为:

249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2016-03-27 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
257 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 2016-03-27 00:00:00.000 2200-01-01 00:00:00.000 2 NULL 10001 shanghai NULL
推荐 1
本文由 doudou1 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册