一个 ETL 数据清洗的小案例 的实现

浏览: 2455

Clipboard Image.png


建表脚本:

-- Create table
create table RT_CHECKPOINT_F
(
id NUMBER,
seq_no VARCHAR2(10),
check_point DATE
);

INSERT INTO RT_CHECKPOINT_F VALUES ( 1001,'0034', TO_DATE('2015-01-01','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1001,'0035', TO_DATE('2015-01-03','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1001,'0036', TO_DATE('2015-01-05','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1001,'0042', TO_DATE('2015-01-07','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1001,'0048', TO_DATE('2015-01-09','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0034', TO_DATE('2015-01-01','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0034', TO_DATE('2015-01-02','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0035', TO_DATE('2015-01-03','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0036', TO_DATE('2015-01-05','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0042', TO_DATE('2015-01-07','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0048', TO_DATE('2015-01-09','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0048', TO_DATE('2015-01-09','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1002,'0048', TO_DATE('2015-01-11','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0032', TO_DATE('2015-01-01','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0011', TO_DATE('2015-01-02','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0034', TO_DATE('2015-01-03','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0034', TO_DATE('2015-01-04','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0031', TO_DATE('2015-01-05','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0036', TO_DATE('2015-01-05','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0036', TO_DATE('2015-01-06','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0031', TO_DATE('2015-01-07','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0034', TO_DATE('2015-01-08','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0035', TO_DATE('2015-01-09','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0036', TO_DATE('2015-01-10','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0036', TO_DATE('2015-01-12','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0042', TO_DATE('2015-01-13','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0048', TO_DATE('2015-01-15','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0048', TO_DATE('2015-01-16','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1003,'0048', TO_DATE('2015-01-18','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0032', TO_DATE('2015-01-01','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0048', TO_DATE('2015-01-01','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0011', TO_DATE('2015-01-02','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0034', TO_DATE('2015-01-03','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0021', TO_DATE('2015-01-03','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0048', TO_DATE('2015-01-04','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0031', TO_DATE('2015-01-05','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0036', TO_DATE('2015-01-06','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0031', TO_DATE('2015-01-07','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0034', TO_DATE('2015-01-08','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0035', TO_DATE('2015-01-09','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0036', TO_DATE('2015-01-10','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0036', TO_DATE('2015-01-12','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0042', TO_DATE('2015-01-13','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0048', TO_DATE('2015-01-15','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0034', TO_DATE('2015-01-16','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1004,'0042', TO_DATE('2015-01-17','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0031', TO_DATE('2015-01-17','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0034', TO_DATE('2015-01-18','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0034', TO_DATE('2015-01-19','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0034', TO_DATE('2015-01-20','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0034', TO_DATE('2015-01-21','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0048', TO_DATE('2015-01-22','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0022', TO_DATE('2015-01-23','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1005,'0048', TO_DATE('2015-01-24','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1006,'0031', TO_DATE('2015-01-16','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1006,'0036', TO_DATE('2015-01-17','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1006,'0034', TO_DATE('2015-01-18','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1006,'0033', TO_DATE('2015-01-19','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0031', TO_DATE('2015-01-10','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0034', TO_DATE('2015-01-11','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0048', TO_DATE('2015-01-12','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0031', TO_DATE('2015-01-13','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0036', TO_DATE('2015-01-14','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0036', TO_DATE('2015-01-15','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0036', TO_DATE('2015-01-16','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0036', TO_DATE('2015-01-17','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0036', TO_DATE('2015-01-18','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0033', TO_DATE('2015-01-19','YYYY-MM-DD'));
INSERT INTO RT_CHECKPOINT_F VALUES ( 1007,'0048', TO_DATE('2015-01-20','YYYY-MM-DD'));
COMMIT;

Oracle 版本11.2.0 具体实现代码:

SELECT ID, SUM(END_DATE - START_DATE) DURATION
FROM (WITH t2 AS (SELECT t.*,
CASE
WHEN SEQ_NO = '0034' AND LAG(SEQ_NO, 1, '*')
OVER(PARTITION BY ID ORDER BY ID,
CHECK_POINT) NOT IN ('0034') OR
SEQ_NO = '0036' AND LAG(SEQ_NO)
OVER(PARTITION BY ID ORDER BY ID,
CHECK_POINT) = '0048' THEN
1
ELSE
0
END AS flag,

CASE
WHEN SEQ_NO = '0036' AND LEAD(SEQ_NO)
OVER(PARTITION BY ID ORDER BY ID,
CHECK_POINT) = '0048' AND
LAST_VALUE(DECODE(SEQ_NO,
'0036',
NULL,
SEQ_NO) IGNORE NULLS)
OVER(PARTITION BY ID ORDER BY ID,
CHECK_POINT ROWS BETWEEN UNBOUNDED
PRECEDING AND 1 PRECEDING) = '0034' THEN
1
ELSE
0
END AS flag2
FROM RT_CHECKPOINT_F t
WHERE t.SEQ_NO IN ('0034', '0036', '0048')), t3 AS (SELECT ID,
SEQ_NO,
CHECK_POINT,
FLAG,
0 flag3
FROM t2
UNION ALL
SELECT ID,
SEQ_NO,
CHECK_POINT,
FLAG2,
1 flag3
FROM t2
WHERE flag2 = 1)
SELECT ID, MIN(CHECK_POINT) START_DATE, MAX(CHECK_POINT) END_DATE
FROM (SELECT t3.*,
SUM(flag) OVER(PARTITION BY ID ORDER BY ID, CHECK_POINT, flag3) grp
FROM t3)
GROUP BY ID, GRP
ORDER BY ID, GRP)
GROUP BY ID
ORDER BY ID;


Clipboard Image.png

推荐 4
本文由 我是最优雅的杀手,不杀人专杀狗 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

1 个评论

这么快就做出来啊。

要回复文章请先登录注册