lightgbm algorithm case of kaggle (Part 2)


Su Gaosheng holds a master's degree in statistics from Southwestern University of Finance and Economics and now works at China Telecom, where he focuses on big-data analysis and modeling of the company's existing enterprise customers. Research interest: machine learning. Favorite programming language: R, bar none.

E-mail: sugs01@outlook.com

Previous posts:

Xgboost algorithm: a Kaggle case

The rxfastforest algorithm case of kaggle


Continuing from the previous post, lightgbm algorithm case of kaggle (Part 1).

Dear readers, please read on...


V. Second round of tuning

1. Tuning the weight parameter

grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_weight_2 <- numeric(length = 20)

for(i in 1:20){
    # upweight the positive class by the multiplier i, then normalize
    lgb_weight <- (lgb_tr$TARGET * i + 1) / sum(lgb_tr$TARGET * i + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[1, 'learning_rate'],
        num_leaves = grid_search[1, 'num_leaves'],
        max_bin = grid_search[1, 'max_bin'],
        min_data_in_bin = grid_search[1, 'min_data_in_bin'],
        feature_fraction = grid_search[1, 'feature_fraction'],
        min_sum_hessian = grid_search[1, 'min_sum_hessian'],
        lambda_l1 = grid_search[1, 'lambda_l1'],
        lambda_l2 = grid_search[1, 'lambda_l2'],
        drop_rate = grid_search[1, 'drop_rate'],
        max_drop = grid_search[1, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_weight_2[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

ggplot(data.frame(num = 1:length(perf_weight_2), perf = perf_weight_2), aes(x = num, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the plot shows that the AUC stabilizes once weight >= 2, with its maximum at weight = 8.
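
Each tuning loop in this section ends with the same unwieldy expression for pulling the final cross-validated AUC out of record_evals. As a readability sketch (get_final_auc is a hypothetical helper, not part of the original script):

get_final_auc <- function(cv_mod) {
    # AUC recorded on the validation folds at each boosting round
    auc_path <- unlist(cv_mod$record_evals$valid$auc$eval)
    # the last recorded value, i.e. the final (possibly early-stopped) round
    auc_path[length(auc_path)]
}

With it, each loop body could simply end in perf_weight_2[i] <- get_final_auc(lgb_tr_mod).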

2. Tuning the learning_rate parameter

grid_search <- expand.grid(
    learning_rate = seq(.05, .5, .01),
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_learning_rate_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_learning_rate_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_learning_rate_1

ggplot(data = grid_search, aes(x = learning_rate, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the AUC is largest at learning_rate = .2.

3. Tuning the num_leaves parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = seq(50, 800, 50),
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_num_leaves_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_num_leaves_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_num_leaves_1

ggplot(data = grid_search, aes(x = num_leaves, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the AUC is largest at num_leaves = 300.

4. Tuning the max_bin parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = seq(30, 150, 10),
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_max_bin_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_max_bin_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_max_bin_1

ggplot(data = grid_search, aes(x = max_bin, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the AUC is largest at max_bin = 120.

5. Tuning the min_data_in_bin parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = seq(20, 100, 5),
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_min_data_in_bin_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_min_data_in_bin_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_min_data_in_bin_1

ggplot(data = grid_search, aes(x = min_data_in_bin, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the AUC is largest at min_data_in_bin = 20.

6. Tuning the feature_fraction parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    # the published line shows only .5, which would leave nothing to scan;
    # a range like this one (an assumption) matches the conclusion below,
    # which reports results for both .5 and .62
    feature_fraction = seq(.5, .9, .02),
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_feature_fraction_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_feature_fraction_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_feature_fraction_1

ggplot(data = grid_search, aes(x = feature_fraction, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: the AUC is largest at feature_fraction = .5; .62 also performs well.

7. Tuning the min_sum_hessian parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    feature_fraction = .5,
    # the published line fixes this at 0, which would leave nothing to scan;
    # a small range starting at 0 (an assumption) matches the negative trend
    # reported in the conclusion below
    min_sum_hessian = seq(0, .01, .001),
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = 5
)

perf_min_sum_hessian_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_min_sum_hessian_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_min_sum_hessian_1

ggplot(data = grid_search, aes(x = min_sum_hessian, y = perf)) +
    geom_point() +
    geom_smooth()

Conclusion: min_sum_hessian is negatively correlated with the AUC, so take min_sum_hessian = 0.

8. Tuning the lambda parameters

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    feature_fraction = .5,
    min_sum_hessian = 0,
    lambda_l1 = seq(0, .01, .002),
    lambda_l2 = seq(0, .01, .002),
    drop_rate = .3,
    max_drop = 5
)

perf_lambda_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_lambda_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_lambda_1

ggplot(data = grid_search, aes(x = lambda_l1, y = perf)) +
    geom_point() +
    facet_wrap(~ lambda_l2, nrow = 5)

Conclusion: the lambdas are negatively correlated with the AUC; take lambda_l1 = .002 and lambda_l2 = .01.

9. Tuning the drop_rate parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    feature_fraction = .5,
    min_sum_hessian = 0,
    lambda_l1 = .002,
    lambda_l2 = .01,
    drop_rate = seq(0, .5, .05),
    max_drop = 5
)

perf_drop_rate_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_drop_rate_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_drop_rate_1

ggplot(data = grid_search, aes(x = drop_rate, y = perf)) +
    geom_point()

Conclusion: the AUC peaks at drop_rate = .3, unchanged from the first round of tuning.

10. Tuning the max_drop parameter

grid_search <- expand.grid(
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    feature_fraction = .5,
    min_sum_hessian = 0,
    lambda_l1 = .002,
    lambda_l2 = .01,
    drop_rate = .3,
    max_drop = seq(19, 29, 2)
)

perf_max_drop_1 <- numeric(length = nrow(grid_search))

for(i in 1:nrow(grid_search)){
    lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)

    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop']
    )

    # cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        num_threads = 2,
        early_stopping_rounds = 10
    )

    perf_max_drop_1[i] <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)[length(unlist(lgb_tr_mod$record_evals$valid$auc$eval))]
}

grid_search$perf <- perf_max_drop_1

ggplot(data = grid_search, aes(x = max_drop, y = perf)) +
    geom_point()

Conclusion: the AUC peaks at max_drop = 23.
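
Collecting the conclusions of this second round, the tuned configuration entering the ensemble stage is, as a recap sketch (the name params_tuned does not appear in the original script):

# Recap of the tuned settings; the weight multiplier of 8 is applied when
# building the lgb.Dataset weights, not passed through params.
params_tuned <- list(
    objective = 'binary',
    metric = 'auc',
    learning_rate = .2,
    num_leaves = 300,
    max_bin = 120,
    min_data_in_bin = 20,
    feature_fraction = .5,
    min_sum_hessian = 0,
    lambda_l1 = .002,
    lambda_l2 = .01,
    drop_rate = .3,
    max_drop = 23
)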

VI. Ensemble learning

1) Parameters

set.seed(1)
grid_search <- expand.grid(
    learning_rate = sample(115:125, 10, replace = FALSE) / 100,
    num_leaves = sample(250:350, 10, replace = FALSE),
    max_bin = sample(115:125, 5, replace = FALSE),
    min_data_in_bin = sample(18:22, replace = FALSE),
    feature_fraction = c(.5, .62),
    min_sum_hessian = 0,
    lambda_l1 = .002,
    lambda_l2 = c(.008, .009, .01),
    drop_rate = sample(126:134, 4, replace = FALSE) / 1000,
    max_drop = c(23, 27, 29)
)

# draw 100 random combinations from the full grid rather than running them all
sample_ind <- sample(dim(grid_search)[1], 100, replace = FALSE)
grid_search2 <- grid_search[sample_ind, ]
rm(grid_search)

lgb.pred <- list()

2) Weights

lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)
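
Before normalization this gives every positive case (TARGET = 1) a weight of 8 + 1 = 9 and every negative case a weight of 1, so each positive carries nine times the influence of a negative; dividing by the sum only rescales the weights to total 1. A quick check (a sketch, not part of the original script):

# raw (unnormalized) weights: 9 for positives, 1 for negatives
raw_w <- lgb_tr$TARGET * 8 + 1
table(raw_w)  # counts of rows at each raw weight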

3) Training dataset

lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    free_raw_data = FALSE,  # keep the raw data so the Dataset can be reused across the 100 runs
    weight = lgb_weight
)

4) Training

for (i in 1:nrow(grid_search2)){
    # parameters
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search2[i, 'learning_rate'],
        num_leaves = grid_search2[i, 'num_leaves'],
        max_bin = grid_search2[i, 'max_bin'],
        min_data_in_bin = grid_search2[i, 'min_data_in_bin'],
        feature_fraction = grid_search2[i, 'feature_fraction'],
        min_sum_hessian = grid_search2[i, 'min_sum_hessian'],
        lambda_l1 = grid_search2[i, 'lambda_l1'],
        lambda_l2 = grid_search2[i, 'lambda_l2'],
        drop_rate = grid_search2[i, 'drop_rate'],
        max_drop = grid_search2[i, 'max_drop']
    )

    # model
    lgb_mod <- lightgbm(
        params = params,
        data = lgb_train,
        nrounds = 300,
        early_stopping_rounds = 10,
        num_threads = 2
    )

    # prediction
    lgb.pred[[i]] <- predict(lgb_mod, data.matrix(lgb_te))
}

5) Results

lgb.pred2 <- matrix(unlist(lgb.pred), ncol = 100)  # one column per model

lgb.pred3 <- data.frame(prob1 = apply(lgb.pred2, 1, mean))  # row-wise average of the 100 models' probabilities

6) Output

write.csv(lgb.pred3, "C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb.pred1.csv")
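
Note that this writes only the averaged probabilities. A Kaggle submission for this competition also needs the test-set ID column; a sketch, assuming the test IDs were kept aside in a vector te_id (a hypothetical name, since the original script does not show one):

# te_id: hypothetical vector holding the test set's ID column
submission <- data.frame(ID = te_id, TARGET = lgb.pred3$prob1)
write.csv(submission, "C:/Users/Administrator/Documents/kaggle/scs_lgb/submission.csv", row.names = FALSE)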



