The rxfastforest algorithm case of kaggle

浏览: 1523

苏高生,西南财经大学统计学硕士毕业,现就职于中国电信,主要负责企业存量客户大数据分析、数据建模。

研究方向:机器学习,最喜欢的编程语言:R语言,没有之一。

E-mail:sugs01@outlook.com

往期回顾:Xgboost算法——Kaggle案例



零、案例背景介绍与建模思路说明

1.背景介绍

本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题,目标为最大化auc值(ROC曲线下方面积)。竞赛题目链接为:https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。

2.建模思路

此文档采用R中的mlr包中的smote算法来处理数据类别不平衡的问题,用Microsoft R Server(专业版R)中的MicrosoftML包中rxFastForest函数进行随机森林建模。采用mlr包调用randomForest包的randomForest函数建模,即使进行并行运算,效率依然低下,不能满足正常工作;因此需要调用Microsoft R Server自带的函数:RevoScaleR包的rxDForest可以进行随机森林建模,但是效率远低于rxFastForest函数,因此本文档采用rxFastForest函数。由于随机森林函数效率较低,因此此文档所读取的数据为“ http://rpubs.com/yisu/xgboost_mlr_kaggle_case_oversample ” 文档中处理后的xgb_tr3,xgb_te3数据(提取了约95%的信息增益);故而本文档直接进入建模部分,不再做数据探索与处理。

1) 读取数据;

2) 并行运算:由于rxFastForest函数可以通过设置相应参数进行并行运算,因此不再调用doParallel与foreach包进行并行运算;

3) 特征选择:本文档不再处理;

4) 调参:逐步调试rxFastForest函数的参数,并多次调试,直到满意为止;

5) 集成预测结果:在每个参数的适宜范围内随机抽取参数值构建rxFastForest模型,并将多个模型进行集成,输出预测结果;本案例所用程序输出结果的ROC值为0.829533,已超过Private Leaderboard排名第一的结果。

一、读取数据

# Load the pre-processed training and test sets.
rx_tr <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_tr3.csv')
rx_te <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_te3.csv')

# Dimensions of the training set. The formula construction and every
# createFolds() call below read these two names, which were never defined.
rx_tr_nrow <- nrow(rx_tr)
rx_tr_ncol <- ncol(rx_tr)

二、算法

3.建模准备

1)模型公式

# Model formula as a string: TARGET regressed on every other column of rx_tr.
# Predictors are selected by name rather than by position, so this no longer
# depends on the undefined `rx_tr_ncol` or on TARGET being the last column.
rx_formula <- paste0(
    'TARGET ~ ',
    paste0(setdiff(colnames(rx_tr), 'TARGET'), collapse = ' + ')
)

2)加装包

library(mlr) ## 调用smote函数

library(parallelMap) ## 并行运算

parallelStartSocket(4)

library(pROC) ## 计算auc值

library(caret) ## 十折交叉验证

library(ggplot2) ## 调参时绘图查看参数不同值对应的auc值,以确定最优参数

4.调试parms参数中的rate与nn参数(smote处理类别不平衡)

1)rate与nn参数

# Candidate SMOTE settings: every combination of oversampling rate and
# number of nearest neighbours.
grid_search <- expand.grid(
    rate = seq(from = 5, to = 50, by = 5),
    nn = seq(from = 5, to = 17, by = 2)
)

2)构建perf矩阵放置auc值

# One row per grid candidate, one column per cross-validation fold (10 folds);
# cells start as NA and are filled with AUC values.
perf_rate_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

3)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(1)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

4)计算auc值—由于rxFastForest自动调用并行运算,因此此处使用循环

# 10-fold cross-validated AUC for every (rate, nn) candidate in grid_search.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task,
            rate = grid_search[j, 'rate'],
            nn = grid_search[j, 'nn']
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = 500,
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_rate_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Mean AUC over the folds for each grid candidate
perf_rate_1_f <- apply(perf_rate_1, 1, mean)

# Attach the mean AUC to the grid; the plot below maps y = perf, and this
# assignment was missing in this section.
grid_search$perf <- perf_rate_1_f

## Plot mean AUC against rate, one panel per nn
ggplot(data = grid_search, aes(x = rate, y = perf)) +
    geom_point() +
    facet_wrap(facets = ~ nn, ncol = 3)

5)结论:rate = 5, nn = 9时最优

5.继续调试parms参数中的rate参数(类别不平衡)

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# nrow(rx_tr) is used directly because `rx_tr_nrow` is never defined.
set.seed(2)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)rate参数

# NOTE(review): the fold-creation code was repeated here in place of the
# parameter grid. The grid is reconstructed from the conclusions (previous
# optimum rate = 5 / nn = 9, this section's optimum rate = 4 / nn = 9);
# the exact range searched is an assumption, please confirm.
grid_search <- expand.grid(
    rate = 2:10,
    nn = 9
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold. The loop below writes
# perf_rate_2[j, i]; the allocation was missing (fold code was repeated here).
perf_rate_2 <- matrix(nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every (rate, nn) candidate in grid_search.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task,
            rate = grid_search[j, 'rate'],
            nn = grid_search[j, 'nn']
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = 500,
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_rate_2[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Mean AUC over the folds for each grid candidate
perf_rate_2_f <- apply(perf_rate_2, 1, mean)

## Plot
# The assignment below had been swallowed into the comment line above,
# so `perf` was never refreshed for this grid.
grid_search$perf <- perf_rate_2_f

ggplot(data = grid_search, aes(x = rate, y = perf)) +
    geom_point()

5)结论:rate = 4, nn = 9时最优

6.调试parms参数中的numTrees参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(3)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)numTrees参数

# Candidate tree counts: 100, 200, ..., 1000
grid_search <- expand.grid(
    numTrees = seq(from = 100, to = 1000, by = 100)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_numTrees_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every numTrees candidate in grid_search.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numTrees_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Mean AUC over the folds for each grid candidate
perf_numTrees_1_f <- apply(perf_numTrees_1, 1, mean)

# The assignment and the ggplot() call were fused into one token
# (`perf_numTrees_1_fggplot`); they are two separate statements.
grid_search$perf <- perf_numTrees_1_f

ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
    geom_point()

5)结论:numTrees=600时最优

7.调试parms参数中的numLeaves参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(4)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)numLeaves参数

# Candidate leaf counts 2^5 ... 2^9, with the tree count held at 600
leaf_exponents <- 5:9
grid_search <- expand.grid(
    numTrees = 600,
    numLeaves = 2 ^ leaf_exponents
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_numLeaves_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every numLeaves candidate in grid_search.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numLeaves_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Row-wise mean of the fold AUCs, attached to the grid for plotting
perf_numLeaves_1_f <- apply(X = perf_numLeaves_1, MARGIN = 1, FUN = mean)
grid_search[['perf']] <- perf_numLeaves_1_f

# Mean AUC against the number of leaves
ggplot(grid_search, aes(x = numLeaves, y = perf)) + geom_point()

5)结论:numLeaves=2^7时最优

8.调试parms参数中的minSplit参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(5)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)minSplit参数

# Candidate minSplit values 5, 10, ..., 30; other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = seq(from = 5, to = 30, by = 5)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_minSplit_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every minSplit candidate in grid_search.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_minSplit_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Mean AUC over the folds for each grid candidate (NA cells ignored)
perf_minSplit_1_f <- apply(perf_minSplit_1, 1, mean, na.rm = TRUE)

# The assignment and the ggplot() call were fused into one token
# (`perf_minSplit_1_fggplot`); they are two separate statements.
grid_search$perf <- perf_minSplit_1_f

ggplot(data = grid_search, aes(x = minSplit, y = perf)) +
    geom_point()

5)结论:minSplit=25 时最优

9.调试parms参数中的exampleFraction参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(6)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)exampleFraction参数

# Candidate exampleFraction values .55 ... .9 in steps of .05;
# other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = seq(from = .55, to = .9, by = .05)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_exampleFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every exampleFraction candidate.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_exampleFraction_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Row-wise mean of the fold AUCs (NA cells ignored), attached to the grid
perf_exampleFraction_1_f <- apply(
    X = perf_exampleFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search[['perf']] <- perf_exampleFraction_1_f

# Mean AUC against exampleFraction
ggplot(grid_search, aes(x = exampleFraction, y = perf)) + geom_point()

5)结论:exampleFraction=.6 时最优,[.55, .65]区间比较好,但是在[.55, 1]区间内变化不大

10.调试parms参数中的featureFraction参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(7)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)featureFraction参数

# Candidate featureFraction values .5 ... .9 in steps of .05;
# other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = seq(from = .5, to = .9, by = .05)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_featureFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every featureFraction candidate.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_featureFraction_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}

# Summary and plot run once, after both loops have finished (they were
# indented as if they belonged to the loop body).
perf_featureFraction_1_f <- apply(perf_featureFraction_1, 1, mean, na.rm = TRUE)

grid_search$perf <- perf_featureFraction_1_f

ggplot(data = grid_search, aes(x = featureFraction, y = perf)) +
    geom_point()

5)结论:featureFraction=.85 时最优,[.75, .9]区间比较好

11.调试parms参数中的splitFraction参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(8)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)splitFraction参数

# Candidate splitFraction values .5 ... .95 in steps of .05;
# other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = seq(from = .5, to = .95, by = .05)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_splitFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every splitFraction candidate.
# The outer loop's closing brace was missing, which left the `for`
# unterminated. seq_len()/seq_along() replace 1:nrow() and the hard-coded
# 1:10.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_splitFraction_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}

# Row-wise mean of the fold AUCs (NA cells ignored), attached to the grid
perf_splitFraction_1_f <- apply(
    X = perf_splitFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search[['perf']] <- perf_splitFraction_1_f

# Mean AUC against splitFraction
ggplot(grid_search, aes(x = splitFraction, y = perf)) + geom_point()

5)结论:splitFraction=.5 时最优,但是变化细微

12.调试parms参数中的numBins参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(9)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)numBins参数

# Candidate numBins values 105, 155, ..., 505; other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = seq(from = 105, to = 505, by = 50)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_numBins_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

# 10-fold cross-validated AUC for every numBins candidate.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numBins_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}

# Summary and plot run once, after both loops have finished.
perf_numBins_1_f <- apply(perf_numBins_1, 1, mean, na.rm = TRUE)

# The assignment and the ggplot() call were fused into one token
# (`perf_numBins_1_fggplot`); they are two separate statements.
grid_search$perf <- perf_numBins_1_f

ggplot(data = grid_search, aes(x = numBins, y = perf)) +
    geom_point() +
    geom_smooth()

5)结论:numBins=350 时最优,但是变化细微

13.调试parms参数中的firstUsePenalty参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(10)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)firstUsePenalty参数

# Candidate firstUsePenalty values; other parameters held fixed.
# NOTE(review): the original range seq(0, 1, .2) cannot produce the value 1.2
# that this section's conclusion and every later grid use. The range is
# widened to 0..2, a superset of the original — confirm the range actually
# searched.
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = seq(0, 2, .2)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_firstUsePenalty_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every firstUsePenalty candidate.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_firstUsePenalty_1[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}

# Row-wise mean of the fold AUCs (NA cells ignored), attached to the grid
perf_firstUsePenalty_1_f <- apply(
    X = perf_firstUsePenalty_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search[['perf']] <- perf_firstUsePenalty_1_f

# Mean AUC against firstUsePenalty, with a smoothed trend
ggplot(grid_search, aes(x = firstUsePenalty, y = perf)) +
    geom_point() + geom_smooth()

5)结论:firstUsePenalty=1.2 时最优

14.调试parms参数中的gainConfLevel参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(11)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)gainConfLevel参数

# Candidate gainConfLevel values .01 ... .1 in steps of .01;
# other parameters held fixed
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = 1.2,
    gainConfLevel = seq(from = .01, to = .1, by = .01)
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_gainConfLevel_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every gainConfLevel candidate.
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            gainConfLevel = grid_search[j, 'gainConfLevel'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_gainConfLevel_2[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}


# Row-wise mean of the fold AUCs (NA cells ignored), attached to the grid
perf_gainConfLevel_2_f <- apply(
    X = perf_gainConfLevel_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search[['perf']] <- perf_gainConfLevel_2_f

# Mean AUC against gainConfLevel, with a smoothed trend
ggplot(grid_search, aes(x = gainConfLevel, y = perf)) +
    geom_point() + geom_smooth()

5)结论:gainConfLevel=0.05 时最优

15.再次调试parms参数中的numTrees参数

1)十折交叉验证

# Fix the RNG seed, then split the training row indices into 10 folds.
# The two statements were fused on one line, which does not parse.
set.seed(12)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)

2)numTrees参数

# Second pass over numTrees (200 ... 600) with all other parameters fixed
# at the values chosen in the preceding sections
grid_search <- expand.grid(
    numTrees = seq(from = 200, to = 600, by = 100),
    numLeaves = 2 ^ 7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = 1.2,
    gainConfLevel = .05
)

3)构建perf矩阵放置auc值

# One row per grid candidate, one column per fold; cells start as NA.
perf_numTrees_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)

4)计算auc值

# 10-fold cross-validated AUC for every numTrees candidate (second pass).
# seq_len()/seq_along() replace 1:nrow() and the hard-coded 1:10 so the loop
# bounds always follow grid_search and folds.
for (j in seq_len(nrow(grid_search))) {
    for (i in seq_along(folds)) {
        # Split: fold i is held out, the remaining rows are used for training
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[folds[[i]], ]

        # SMOTE oversampling on the training part only
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)

        # Train
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            gainConfLevel = grid_search[j, 'gainConfLevel'],
            trainThreads = 4
        )

        # Predict on the held-out fold
        rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)

        # Numeric labels for the AUC computation
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)

        # AUC of this fold, stored as a plain number
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numTrees_2[j, i] <- as.numeric(rx_tr_roc$auc)
    }
}

# Row-wise mean of the fold AUCs (NA cells ignored), attached to the grid
perf_numTrees_2_f <- apply(
    X = perf_numTrees_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search[['perf']] <- perf_numTrees_2_f

# Mean AUC against numTrees, with a smoothed trend
ggplot(grid_search, aes(x = numTrees, y = perf)) +
    geom_point() + geom_smooth()

5)结论:numTrees=600 时最优

结论:由于参数不发生变化,因此停止训练

三、集成学习

0)参数

# Parameter pool for the ensemble: a few values are drawn at random around
# each tuned optimum, all combinations are enumerated, and 100 combinations
# are then sampled without replacement.
# `set.seed(1)` and the assignment were fused on one line, which does not
# parse; they are two separate statements.
set.seed(1)
grid_search <- expand.grid(
    numTrees = sample(550:650, 10, replace = FALSE),
    numLeaves = sample(124:132, 4, replace = FALSE),
    minSplit = sample(24:26, 2, replace = FALSE),
    exampleFraction = sample(550:650, 10, replace = FALSE) / 1000,
    featureFraction = sample(750:900, 10, replace = FALSE) / 1000,
    splitFraction = sample(45:55, 5, replace = FALSE) / 100,
    numBins = sample(320:380, 5, replace = FALSE),
    firstUsePenalty = sample(115:125, 5, replace = FALSE) / 100,
    gainConfLevel = sample(45:55, 5, replace = FALSE) / 1000
)

# Keep 100 randomly chosen rows of the full grid
sample_ind <- sample(nrow(grid_search), 100, replace = FALSE)
grid_search2 <- grid_search[sample_ind, ]

# The full grid is no longer needed; drop it to free memory
rm(grid_search)

1) 放置结果

# Preallocate one slot per ensemble member instead of growing the list
# inside the training loop.
rxfastforest.pred <- vector("list", nrow(grid_search2))

2)训练

# Train one rxFastForest model per sampled parameter combination on the
# SMOTE-oversampled full training set and keep its test-set probabilities.

# The classification task does not depend on i, so it is built once.
rx_tr_task <- makeClassifTask(data = rx_tr, target = 'TARGET')

# seq_len() replaces `1:nrow(grid_search2)[1]`.
for (i in seq_len(nrow(grid_search2))) {
    # SMOTE oversampling stays inside the loop so that each model gets its
    # own call to smote(), as in the original.
    rx_tr_task_smote <- smote(rx_tr_task, rate = 4, nn = 9)
    rx_tr_2 <- getTaskData(rx_tr_task_smote)

    # Train
    rx_tr_mod <- rxFastForest(
        formula = rx_formula,
        data = rx_tr_2,
        numTrees = grid_search2[i, 'numTrees'],
        numLeaves = grid_search2[i, 'numLeaves'],
        minSplit = grid_search2[i, 'minSplit'],
        exampleFraction = grid_search2[i, 'exampleFraction'],
        featureFraction = grid_search2[i, 'featureFraction'],
        splitFraction = grid_search2[i, 'splitFraction'],
        numBins = grid_search2[i, 'numBins'],
        firstUsePenalty = grid_search2[i, 'firstUsePenalty'],
        gainConfLevel = grid_search2[i, 'gainConfLevel'],
        trainThreads = 4
    )

    # Predict on the test set
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_te)

    rxfastforest.pred[[i]] <- rx_tr_pre$Probability.1
}

3)结果

# One column per ensemble member. The column count follows the number of
# stored prediction vectors instead of a hard-coded 100.
rxfastforest.pred2 <- matrix(
    unlist(rxfastforest.pred),
    ncol = length(rxfastforest.pred)
)

# Ensemble prediction: mean probability across models for each test row
rxfastforest.pred3 <- data.frame(prob1 = rowMeans(rxfastforest.pred2))

4)输出

# Write the averaged probabilities to disk
write.csv(
    x = rxfastforest.pred3,
    file = "C:/Users/Administrator/Documents/kaggle/scs_rf/rxfastforest.pred1.csv"
)



 往期精彩内容整理合集 

2017年R语言发展报告(国内)

R语言中文社区历史文章整理(作者篇)

R语言中文社区历史文章整理(类型篇)

公众号后台回复关键字即可学习

回复 R                  R语言快速入门及数据挖掘 
回复 Kaggle案例  Kaggle十大案例精讲(连载中)
回复 文本挖掘      手把手教你做文本挖掘
回复 可视化          R语言可视化在商务场景中的应用 
回复 大数据         大数据系列免费视频教程 
回复 量化投资      张丹教你如何用R语言量化投资 
回复 用户画像      京东大数据,揭秘用户画像
回复 数据挖掘     常用数据挖掘算法原理解释与应用
回复 机器学习     人工智能系列之机器学习与实践
回复 爬虫            R语言爬虫实战案例分享


推荐 0
本文由 R语言中文社区 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册