The rxfastforest algorithm case of kaggle

发表: 2018-03-19 浏览: 1523

数据分析 R语言

苏高生，西南财经大学统计学硕士毕业，现就职于中国电信，主要负责企业存量客户大数据分析、数据建模。
研究方向：机器学习，最喜欢的编程语言：R语言，没有之一。
E-mail：sugs01@outlook.com

往期回顾：Xgboost算法——Kaggle案例

零、案例背景介绍与建模思路说明

1.背景介绍

本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题，目标为最大化auc值（ROC曲线下方面积）。竞赛题目链接为：https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。

2.建模思路

此文档采用R中的mlr包中的smote算法来处理数据类别不平衡的问题，用Microsoft R Server(专业版R)中的MicrosoftML包中rxFastForest函数进行随机森林建模。采用mlr包调用randomForest包的randomForest函数建模，进行并行运算，效率依然低下，不能满足正常工作；因此需要调用Microsoft R Server自带的函数：RevoScaleR包的rxDForest可以进行随机森林建模，但是效率远低于MicrosoftML包的rxFastForest函数，因此本文档采用rxFastForest函数。由于随机森林函数效率较低，因此此文档所读取的数据为“ http://rpubs.com/yisu/xgboost_mlr_kaggle_case_oversample ” 文档中处理后的xgb_tr3,xgb_te3数据（提取了约95%的信息增益）；故而本文档直接进入建模部分，不再做数据探索与处理。

1) 读取数据；

2) 并行运算：由于rxFastForest函数可以通过设置相应参数进行并行运算，因此不再调用doParallel与foreach包进行并行运算；

3) 特征选择：本文档不再处理；

4) 调参：逐步调试rxFastForest函数的参数，并多次调试，直到满意为止；

5) 集成预测结果：在每个参数的适宜范围内随机抽取参数值构建rxFastForest模型，并将多个模型进行集成，输出预测结果；本案例所用程序输出结果的AUC值（ROC曲线下方面积）为0.829533,已超过Private Leaderboard排名第一的结果。

一、读取数据

# Import the pre-processed training and test CSV files with rxImport.
rx_tr <- rxImport(inData = 'C:/Users/Administrator/Documents/kaggle/scs_rf/rf_tr3.csv')
rx_te <- rxImport(inData = 'C:/Users/Administrator/Documents/kaggle/scs_rf/rf_te3.csv')

二、算法

3.建模准备

1）模型公式

# Build the model formula as a string: TARGET on the left-hand side and every
# other column of rx_tr, joined with ' + ', on the right-hand side.
# Predictors are picked by name rather than by position, so the formula no
# longer relies on a separately defined column count (rx_tr_ncol).
rx_predictors <- setdiff(colnames(rx_tr), 'TARGET')
rx_formula <- paste0('TARGET ~ ', paste0(rx_predictors, collapse = ' + '))

2）加装包

library(mlr) ## provides the smote function
library(parallelMap) ## parallel computation
# NOTE(review): no matching parallelStop() call is visible in this file.
parallelStartSocket(4)
library(pROC) ## computes the AUC value
library(caret) ## 10-fold cross-validation
library(ggplot2) ## plots the AUC for each parameter value during tuning, to pick the best setting

4.调试parms参数中的rate与nn参数（smote处理类别不平衡）

1）rate与nn参数

# All combinations of the smote() arguments to try:
# rate = 5, 10, ..., 50 and nn = 5, 7, ..., 17.
grid_search <- expand.grid(
  rate = seq(from = 5, to = 50, by = 5),
  nn = seq(from = 5, to = 17, by = 2)
)

2）构建perf矩阵放置auc值

# AUC store: one row per grid combination, one column per fold of the
# 10-fold cross-validation. Cells start as NA.
perf_rate_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

3）十折交叉验证

# Fix the RNG seed so the fold assignment is reproducible, then split the row
# indices of rx_tr into 10 folds. The loop below uses folds[[i]] as the
# held-out rows of fold i.
set.seed(1)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

4）计算auc值—由于rxFastForest自动调用并行运算，因此此处使用循环

# Score every (rate, nn) pair with 10-fold cross-validation. A plain nested
# loop is used; rxFastForest itself is called with trainThreads = 4.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = grid_search[grid_row, 'rate'],
      nn = grid_search[grid_row, 'nn']
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_rate_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Mean AUC over the 10 folds for each (rate, nn) combination
perf_rate_1_f <- apply(perf_rate_1, 1, mean)

# Attach the means to the grid; the plot below maps y to the perf column,
# which does not exist until it is assigned here.
grid_search$perf <- perf_rate_1_f

# Plot mean AUC against rate, one panel per nn value
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point() +
  facet_wrap(facets = ~ nn, ncol = 3)

5）结论：rate = 5, nn = 9时最优

5.继续调试parms参数中的rate参数（类别不平衡）

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr. The row count is
# taken from rx_tr directly instead of the never-defined rx_tr_nrow.
set.seed(2)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）rate参数

# Finer rate grid around the previous optimum (rate = 5), with nn fixed at 9.
# NOTE(review): the published snippet under this heading repeated the fold
# creation code instead of defining the grid, so the exact rate values used
# by the author are unknown. 2:8 is a reviewer assumption that brackets the
# reported optimum (rate = 4) - confirm against the original analysis.
grid_search <- expand.grid(
  rate = 2:8,
  nn = 9
)

3）构建perf矩阵放置auc值

# AUC store for the second rate search: one row per grid combination, one
# column per fold. The published snippet repeated the fold creation here and
# never created perf_rate_2, which the loop below writes into.
perf_rate_2 <- matrix(nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every row of the rate grid.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = grid_search[grid_row, 'rate'],
      nn = grid_search[grid_row, 'nn']
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_rate_2[grid_row, fold_id] <- fold_roc$auc
  }
}

# Mean AUC over the 10 folds for each rate value
perf_rate_2_f <- apply(perf_rate_2, 1, mean)

# Plot: the assignment below used to sit on the comment line and was
# therefore never executed.
grid_search$perf <- perf_rate_2_f
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point()

5）结论：rate = 4, nn = 9时最优

6.调试parms参数中的numTrees参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(3)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）numTrees参数

# numTrees candidates: 100, 200, ..., 1000
grid_search <- expand.grid(
  numTrees = seq(from = 100, to = 1000, by = 100)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_numTrees_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every numTrees candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_numTrees_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Mean AUC over the 10 folds for each numTrees value
perf_numTrees_1_f <- apply(perf_numTrees_1, 1, mean)

# Attach the means to the grid and plot them against numTrees. The assignment
# and the ggplot() call are separate statements; they were fused into one
# token in the published snippet.
grid_search$perf <- perf_numTrees_1_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
  geom_point()

5）结论：numTrees=600时最优

7.调试parms参数中的numLeaves参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(4)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）numLeaves参数

# numTrees held at 600; numLeaves candidates are 2^5, 2^6, ..., 2^9
grid_search <- expand.grid(
  numTrees = 600,
  numLeaves = 2^(5:9)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_numLeaves_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every numLeaves candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_numLeaves_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Row-wise mean: average AUC over the 10 folds per numLeaves value
perf_numLeaves_1_f <- apply(X = perf_numLeaves_1, MARGIN = 1, FUN = mean)

# Store the means next to the grid values and plot them
grid_search[['perf']] <- perf_numLeaves_1_f
ggplot(grid_search, aes(x = numLeaves, y = perf)) + geom_point()

5）结论：numLeaves=2^7时最优

8.调试parms参数中的minSplit参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(5)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）minSplit参数

# numTrees and numLeaves held fixed; minSplit candidates are 5, 10, ..., 30
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = seq(from = 5, to = 30, by = 5)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_minSplit_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every minSplit candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_minSplit_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Mean AUC over the 10 folds for each minSplit value (NA cells are ignored)
perf_minSplit_1_f <- apply(perf_minSplit_1, 1, mean, na.rm = TRUE)

# Attach the means to the grid and plot them against minSplit. The assignment
# and the ggplot() call are separate statements; they were fused into one
# token in the published snippet.
grid_search$perf <- perf_minSplit_1_f
ggplot(data = grid_search, aes(x = minSplit, y = perf)) +
  geom_point()

5）结论：minSplit=25 时最优

9.调试parms参数中的exampleFraction参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(6)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）exampleFraction参数

# Earlier parameters held fixed; exampleFraction candidates are
# 0.55, 0.60, ..., 0.90
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = seq(from = .55, to = .9, by = .05)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_exampleFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every exampleFraction candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      exampleFraction = grid_search[grid_row, 'exampleFraction'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_exampleFraction_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Row-wise mean: average AUC over the 10 folds, ignoring NA cells
perf_exampleFraction_1_f <- apply(
  X = perf_exampleFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)

# Store the means next to the grid values and plot them
grid_search[['perf']] <- perf_exampleFraction_1_f
ggplot(grid_search, aes(x = exampleFraction, y = perf)) + geom_point()

5）结论：exampleFraction=.6 时最优，[.55, .65]比较好，但是[.55, 1]区间变化不大

10.调试parms参数中的featureFraction参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(7)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）featureFraction参数

# Earlier parameters held fixed; featureFraction candidates are
# 0.50, 0.55, ..., 0.90
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = seq(from = .5, to = .9, by = .05)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_featureFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every featureFraction candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      exampleFraction = grid_search[grid_row, 'exampleFraction'],
      featureFraction = grid_search[grid_row, 'featureFraction'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_featureFraction_1[grid_row, fold_id] <- fold_roc$auc
  }
}

# Row-wise mean: average AUC over the 10 folds, ignoring NA cells
perf_featureFraction_1_f <- apply(
  X = perf_featureFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)

# Store the means next to the grid values and plot them
grid_search[['perf']] <- perf_featureFraction_1_f
ggplot(grid_search, aes(x = featureFraction, y = perf)) + geom_point()

5）结论：featureFraction=.85 时最优，[.75, .9]比较好

11.调试parms参数中的splitFraction参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(8)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）splitFraction参数

# Earlier parameters held fixed; splitFraction candidates are
# 0.50, 0.55, ..., 0.95
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = seq(from = .5, to = .95, by = .05)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_splitFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every splitFraction candidate.
# The outer loop is now closed before the summary code; in the published
# snippet its closing brace was missing, so the script did not parse.
for (j in seq_len(nrow(grid_search))) {
  for (i in 1:10) {
    # Split: rows of fold i are held out, the rest are used for fitting
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Numeric copy of the observed label, needed for the AUC computation
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_splitFraction_1[j, i] <- rx_tr_roc$auc
  }
}

# Mean AUC over the 10 folds for each splitFraction value (NA cells ignored)
perf_splitFraction_1_f <- apply(perf_splitFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_splitFraction_1_f
ggplot(data = grid_search, aes(x = splitFraction, y = perf)) +
  geom_point()

5)结论：splitFraction=.5 时最优,但是变化细微

12.调试parms参数中的numBins参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(9)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）numBins参数

# Earlier parameters held fixed; numBins candidates are 105, 155, ..., 505
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = seq(from = 105, to = 505, by = 50)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_numBins_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
# 10-fold cross-validated AUC for every numBins candidate.
for (j in seq_len(nrow(grid_search))) {
  for (i in 1:10) {
    # Split: rows of fold i are held out, the rest are used for fitting
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Numeric copy of the observed label, needed for the AUC computation
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numBins_1[j, i] <- rx_tr_roc$auc
  }
}

# Mean AUC over the 10 folds for each numBins value (NA cells ignored)
perf_numBins_1_f <- apply(perf_numBins_1, 1, mean, na.rm = TRUE)

# The assignment and the ggplot() call are separate statements; they were
# fused into one token in the published snippet.
grid_search$perf <- perf_numBins_1_f
ggplot(data = grid_search, aes(x = numBins, y = perf)) +
  geom_point() +
  geom_smooth()

5)结论：numBins=350 时最优,但是变化细微

13.调试parms参数中的firstUsePenalty参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(10)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）firstUsePenalty参数

# Earlier parameters held fixed; firstUsePenalty candidates are
# 0, 0.2, ..., 1
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = seq(from = 0, to = 1, by = .2)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_firstUsePenalty_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every firstUsePenalty candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      exampleFraction = grid_search[grid_row, 'exampleFraction'],
      featureFraction = grid_search[grid_row, 'featureFraction'],
      splitFraction = grid_search[grid_row, 'splitFraction'],
      numBins = grid_search[grid_row, 'numBins'],
      firstUsePenalty = grid_search[grid_row, 'firstUsePenalty'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_firstUsePenalty_1[grid_row, fold_id] <- fold_roc$auc
  }
}
# Row-wise mean: average AUC over the 10 folds, ignoring NA cells
perf_firstUsePenalty_1_f <- apply(
  X = perf_firstUsePenalty_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)

# Store the means next to the grid values and plot them with a smoother
grid_search[['perf']] <- perf_firstUsePenalty_1_f
ggplot(grid_search, aes(x = firstUsePenalty, y = perf)) +
  geom_point() +
  geom_smooth()

5)结论：firstUsePenalty=1.2 时最优

14.调试parms参数中的gainConfLevel参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(11)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）gainConfLevel参数

# Earlier parameters held fixed; gainConfLevel candidates are
# 0.01, 0.02, ..., 0.10
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = seq(from = .01, to = .1, by = .01)
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_gainConfLevel_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every gainConfLevel candidate.
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      exampleFraction = grid_search[grid_row, 'exampleFraction'],
      featureFraction = grid_search[grid_row, 'featureFraction'],
      splitFraction = grid_search[grid_row, 'splitFraction'],
      numBins = grid_search[grid_row, 'numBins'],
      firstUsePenalty = grid_search[grid_row, 'firstUsePenalty'],
      gainConfLevel = grid_search[grid_row, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_gainConfLevel_2[grid_row, fold_id] <- fold_roc$auc
  }
}

# Row-wise mean: average AUC over the 10 folds, ignoring NA cells
perf_gainConfLevel_2_f <- apply(
  X = perf_gainConfLevel_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)

# Store the means next to the grid values and plot them with a smoother
grid_search[['perf']] <- perf_gainConfLevel_2_f
ggplot(grid_search, aes(x = gainConfLevel, y = perf)) +
  geom_point() +
  geom_smooth()

5)结论：gainConfLevel=0.05 时最优

15.再次调试parms参数中的numTrees参数

1）十折交叉验证

# Reproducible 10-fold split of the row indices of rx_tr
set.seed(12)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2）numTrees参数

# All other parameters held at their tuned values; numTrees is searched again
# over 200, 300, ..., 600
grid_search <- expand.grid(
  numTrees = seq(from = 200, to = 600, by = 100),
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = .05
)

3）构建perf矩阵放置auc值

# AUC store: rows = grid combinations, columns = the 10 folds
perf_numTrees_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4）计算auc值

# 10-fold cross-validated AUC for every numTrees candidate (second pass).
for (grid_row in seq_len(nrow(grid_search))) {
  for (fold_id in 1:10) {
    # Hold out the rows of the current fold; fit on the remaining rows
    held_out <- folds[[fold_id]]
    fit_part <- rx_tr[-held_out, ]
    val_part <- rx_tr[held_out, ]
    # SMOTE (rate = 4, nn = 9) is applied to the fitting part only
    fit_task <- smote(
      makeClassifTask(data = fit_part, target = 'TARGET'),
      rate = 4,
      nn = 9
    )
    fit_smote <- getTaskData(fit_task)
    # Train
    forest <- rxFastForest(
      formula = rx_formula,
      data = fit_smote,
      numTrees = grid_search[grid_row, 'numTrees'],
      numLeaves = grid_search[grid_row, 'numLeaves'],
      minSplit = grid_search[grid_row, 'minSplit'],
      exampleFraction = grid_search[grid_row, 'exampleFraction'],
      featureFraction = grid_search[grid_row, 'featureFraction'],
      splitFraction = grid_search[grid_row, 'splitFraction'],
      numBins = grid_search[grid_row, 'numBins'],
      firstUsePenalty = grid_search[grid_row, 'firstUsePenalty'],
      gainConfLevel = grid_search[grid_row, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out rows
    val_pred <- rxPredict(forest, val_part)
    # Numeric copy of the observed label, needed for the AUC computation
    val_pred$actual <- as.numeric(val_part$TARGET)
    # AUC of this fold
    fold_roc <- roc(val_pred$actual, val_pred$Probability.1)
    perf_numTrees_2[grid_row, fold_id] <- fold_roc$auc
  }
}
# Row-wise mean: average AUC over the 10 folds, ignoring NA cells
perf_numTrees_2_f <- apply(
  X = perf_numTrees_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)

# Store the means next to the grid values and plot them with a smoother
grid_search[['perf']] <- perf_numTrees_2_f
ggplot(grid_search, aes(x = numTrees, y = perf)) +
  geom_point() +
  geom_smooth()

5)结论：numTrees=600 时最优

结论：由于参数不发生变化，因此停止训练

三、集成学习

0)参数

# Draw 100 random parameter combinations for the ensemble.
# set.seed() and the grid construction are separate statements; they were
# fused into one token in the published snippet.
set.seed(1)

# Candidate values per parameter, sampled in the same order as before.
candidates <- list(
  numTrees = sample(550:650, 10, replace = FALSE),
  numLeaves = sample(124:132, 4, replace = FALSE),
  minSplit = sample(24:26, 2, replace = FALSE),
  exampleFraction = sample(550:650, 10, replace = FALSE) / 1000,
  featureFraction = sample(750:900, 10, replace = FALSE) / 1000,
  splitFraction = sample(45:55, 5, replace = FALSE) / 100,
  numBins = sample(320:380, 5, replace = FALSE),
  firstUsePenalty = sample(115:125, 5, replace = FALSE) / 100,
  gainConfLevel = sample(45:55, 5, replace = FALSE) / 1000
)

# The full Cartesian product has 5,000,000 rows. Instead of materializing it
# with expand.grid() and keeping 100 rows, sample 100 distinct row numbers and
# decode each one into per-parameter positions. expand.grid() varies the first
# factor fastest, which is the column-major order arrayInd() decodes, so the
# selected combinations are the rows the full grid would have had.
n_values <- lengths(candidates)
sample_ind <- sample(as.integer(prod(n_values)), 100, replace = FALSE)
pos <- arrayInd(sample_ind, .dim = n_values)
grid_search2 <- as.data.frame(
  Map(function(values, k) values[pos[, k]], candidates, seq_along(candidates))
)
rownames(grid_search2) <- sample_ind

# As before, no grid_search object is left behind after this step.
if (exists("grid_search", inherits = FALSE)) {
  rm(grid_search)
}

1) 放置结果

# Result container, pre-allocated with one slot per ensemble member so the
# training loop fills slots instead of growing the list.
rxfastforest.pred <- vector("list", nrow(grid_search2))

2)训练

# Train one rxFastForest model per row of grid_search2 on a SMOTE-resampled
# copy of the full training set and keep its Probability.1 predictions for
# rx_te. seq_len() replaces `1:nrow(grid_search2)[1]`, which carried a
# redundant index and would iterate over c(1, 0) for an empty grid.
for (i in seq_len(nrow(grid_search2))) {
  # SMOTE is re-run inside the loop, as in the original script
  rx_tr_task <- makeClassifTask(data = rx_tr, target = 'TARGET')
  rx_tr_task_smote <- smote(
    rx_tr_task,
    rate = 4,
    nn = 9
  )
  rx_tr_2 <- getTaskData(rx_tr_task_smote)
  # Train with the i-th parameter combination
  rx_tr_mod <- rxFastForest(
    formula = rx_formula,
    data = rx_tr_2,
    numTrees = grid_search2[i, 'numTrees'],
    numLeaves = grid_search2[i, 'numLeaves'],
    minSplit = grid_search2[i, 'minSplit'],
    exampleFraction = grid_search2[i, 'exampleFraction'],
    featureFraction = grid_search2[i, 'featureFraction'],
    splitFraction = grid_search2[i, 'splitFraction'],
    numBins = grid_search2[i, 'numBins'],
    firstUsePenalty = grid_search2[i, 'firstUsePenalty'],
    gainConfLevel = grid_search2[i, 'gainConfLevel'],
    trainThreads = 4
  )
  # Predict on the test set
  rx_tr_pre <- rxPredict(
    rx_tr_mod,
    rx_te
  )
  rxfastforest.pred[[i]] <- rx_tr_pre$Probability.1
}

3)结果

# Arrange the per-model prediction vectors as columns (one column per model).
# The column count is taken from the list itself instead of a hard-coded 100,
# so it stays correct if the number of ensemble members changes.
rxfastforest.pred2 <- matrix(
  unlist(rxfastforest.pred),
  ncol = length(rxfastforest.pred)
)
# Ensemble prediction: mean over models for each test row
rxfastforest.pred3 <- data.frame(prob1 = rowMeans(rxfastforest.pred2))

4)输出

# Write the averaged predictions to CSV
write.csv(
  x = rxfastforest.pred3,
  file = "C:/Users/Administrator/Documents/kaggle/scs_rf/rxfastforest.pred1.csv"
)

　往期精彩内容整理合集　
2017年R语言发展报告（国内）
R语言中文社区历史文章整理（作者篇）
R语言中文社区历史文章整理（类型篇）

公众号后台回复关键字即可学习
回复 R                  R语言快速入门及数据挖掘
回复 Kaggle案例  Kaggle十大案例精讲（连载中）
回复文本挖掘   手把手教你做文本挖掘
回复可视化   R语言可视化在商务场景中的应用
回复大数据         大数据系列免费视频教程
回复量化投资      张丹教你如何用R语言量化投资
回复用户画像      京东大数据，揭秘用户画像
回复数据挖掘     常用数据挖掘算法原理解释与应用
回复机器学习     人工智能系列之机器学习与实践
回复爬虫            R语言爬虫实战案例分享

0 个评论

要回复文章请先登录或注册