Ben老师:我作了三个模型,Logit ridge lasso 请老师讲下,选哪一个模型较好
一、AUC结果
Logistic-AUC Ridge-AUC Lasso-AUC
0.836 0.854 0.833
二、列联表
Logistic 0 1 Ridge 0 1 Lasso 0 1
0 330 48 0 330 37 0 321 43
1 66 248 1 66 259 1 75 253
三、代码如下:
setwd('E:/finance_R')
telecom<-read.csv("telecom_churn.csv") #电信公司希望针对客户的信息预测其流失可能性
require(mice)
md.pattern(telecom) #无缺失
#-------------------------------------------------观察1
#subscriberID="个人客户的ID"
#churn="是否流失:1=流失";
table(telecom$churn)
prop.table(table(telecom$churn)) #比例相差不大
#Age="年龄"
hist(telecom$AGE,prob=T,main='年龄分布',breaks=30);lines(density(telecom$AGE)) #右偏,数值
require(e1071)
skewness(telecom$AGE) #右偏不严重,先按正态算,之后再说
#incomeCode="用户居住区域平均收入的代码"
attach(telecom)
#duration="在网时长"
hist(duration,prob=T,main='在网时间',breaks=30) #右偏,数值
#peakMinAv="统计期间内最高单月通话时长"
hist(peakMinAv,prob=T,main='通话时长',breaks=30) #象正态,数值
skewness(peakMinAv) #1.005058, 肯定取LOG
#peakMinDiff="统计期间结束月份与开始月份相比通话时长增加数量"
hist(peakMinDiff,prob=T,main='统计期间结束月份与开始月份相比通话时长增加数量',breaks=30) #不用看,正态
#posTrend="该用户通话时长是否呈现出上升态势:是=1"
#negTrend="该用户通话时长是否呈现出下降态势:是=1"
#nrProm="电话公司营销的数量"
hist(nrProm,prob=T,main='营销次数',breaks=30) #先当数量来吧,不行再转因子
#prom="最近一个月是否被营销过:是=1"
#curPlan="统计时间开始时套餐类型:1=最高通过200分钟;2=300分钟;3=350分钟;4=500分钟"
#avPlan="统计期间内平均套餐类型"
#planChange="统计期间是否更换过套餐:1=是"
#posPlanChange="统计期间是否提高套餐:1=是"
#negPlanChange="统计期间是否降低套餐:1=是"
#call_10086="拨打10086的次数"
hist(call_10086,prob=T,main='拨打10086的次数',breaks=30) #照理说应是数值,可就是0,1
detach(telecom)
#-----------------------------------------观察2--形成分类变量数据集和连续变量数据集
tele_num<-telecom[,c(2,4,7,9,10,13,20)]
#6个数值(4.AGE\7.duration\9.peakMinAv\10.peakMinDiff\13.nrProm\20.call_10086)再加上2.因变量
tele_fac<-telecom[,c(2,3,5,6,8,11,12,14,15,16,17,18,19)]
#12个因子(3.gander\5.edu_class\6.incomeCode\8.feton\11.posTrend\12.negTrend\14.prom
# \15.curPlan\16.avPlan\17.planChange\18.posPlanChange\19.negPlanChange)
#-----------------------------------------观察3--连续变量的相关关系检测
a<-dim(tele_num)[2]
b1<-matrix(nrow=(a-1),ncol=2)
colnames(b1)<-c("Estimate","P-values")
rownames(b1)<-c( names(tele_num)[2:a])
for(i in 2:a){
c<-summary(glm(tele_num$churn~tele_num[,i],family=binomial(link='logit')))
b1[i-1,1]<-c$coefficients[2,1]
b1[i-1,2]<-c$coefficients[2,4]
}
print(b1) #peakMinAv0.03390755" nrProm0.4250104 这两个变量有可能没用
#-----------抽样再检一次,抽20%个
require(caret)
num_20<-createDataPartition(y=tele_num$churn,p=0.2,list=F)
tele_num_20<-tele_num[num_20,]
prop.table(table(tele_num_20$churn))
b2<-matrix(nrow=(a-1),ncol=2)
colnames(b2)<-c("Estimate_20%","P-values_20%")
rownames(b2)<-c( names(tele_num_20)[2:a])
for(i in 2:a){
c<-summary(glm(tele_num_20$churn~tele_num_20[,i],family=binomial(link='logit')))
b2[i-1,1]<-c$coefficients[2,1]
b2[i-1,2]<-c$coefficients[2,4]
}
print(b2) # 验证 peakMinAv和nrProm 确实没用
#----------------------------------------观察4--分类变量的相关关系检测
d<-dim(tele_fac)[2]
for(i in 1:d){
tele_fac[,i]<-as.factor(tele_fac[,i]) #全转因子
}
e1<-matrix(nrow=(d-1),ncol=2)
colnames(e1)<-c("X-squared","P-values")
rownames(e1)<-c(names(tele_fac)[2:d])
for(i in 2:d){
f<-chisq.test(x=tele_fac[,i],y=tele_fac$churn)
e1[i-1,1]<-f$ statistic
e1[i-1,2]<-f$ p.value
}
print(e1) #prom\planChange\posPlanChange\negPlanChange 4个变量不显著去掉
#-----------抽样再检一次,抽20%个
fac_20<-createDataPartition(y=tele_fac$churn,p=0.2,list=F)
tele_fac_20<-tele_fac[fac_20,]
prop.table(table(tele_num_20$churn))
e2<-matrix(nrow=(d-1),ncol=2)
colnames(e2)<-c("X-squared_20%","P-values_20%")
rownames(e2)<-c(names(tele_fac_20)[2:d])
for(i in 2:d){
f<-chisq.test(x=tele_fac_20[,i],y=tele_fac_20$churn)
e2[i-1,1]<-f$ statistic
e2[i-1,2]<-f$ p.value
}
print(e2) # #prom\planChange\posPlanChange\negPlanChange 再次证明:4个变量不显著去掉
#-----------------------总结--------------
#连续留下4个AGE、duration、peakMinDiff、call_10086
#分类留下8个gender\edu_class\incomeCode\feton\posTrend\negTrend\curPlan\avgplan
#-------------------------------------------------生成训练集
tele_fac<-tele_fac[c(2,3,4,5,6,7,9,10)]
dummy<-dummyVars(~.,data=tele_fac,fullRank=T)
tele_fac<-data.frame(predict(dummy,newdata=tele_fac))
tele_num<-tele_num[-c(4,6)]
data<-cbind(tele_num,tele_fac)
set.seed(2018)
id<-createDataPartition(y=data$churn ,p=0.8,list=F) #按区分层抽20%
train<-data[id,] #glm 训练
train_ridge<-train #ridge 训练
train_lasso<-train #lasso 训练
prop.table(table(train$churn))
test<-data[-id,] #glm检验
test_ridge<-test #ridge检验
test_lasso<-test #lasso检验
prop.table(table(test$churn))
#--------------------------------------建模
m1<-glm(churn~.,family=binomial(link="logit"),data=train)
summary(m1)
#step(m1,direction="both") #2的n次方根本没办法算,直接把没用的去掉
m2<-glm(churn~AGE+duration+peakMinDiff+call_10086+gender.1+edu_class.1+
edu_class.2+edu_class.3+incomeCode.19+incomeCode.36+feton.1+
posTrend.1+negTrend.1,family=binomial(link="logit"),data=train)
summary(m2)
step(m2,direction="both") #没有变量可以再踢
require(car)
vif(m2)
#posTrend="该用户通话时长是否呈现出上升态势:是=1" #negTrend="该用户通话时长是否呈现出下降态势:是=1"
# posTrend.1与 negTrend.1 是有共性性的,----应去掉一个,重复了
m3<-glm(churn~AGE+duration+peakMinDiff+call_10086+gender.1+edu_class.1+
edu_class.2+edu_class.3+incomeCode.19+incomeCode.36+feton.1+
posTrend.1,family=binomial(link="logit"),data=train)
summary(m3) #incomeCode.19 应去掉
step(m3,direction="both") #但STEP的结果表示,incomeCode.19也可以留着,先留着吧
vif(m3) #已无共性性了
#----------------------------------------------用test检验,试下结果
test$pred<-predict(m3,test) #生成测试集的结果
test$p<-1/(1+exp(-test$pred))
test$out<- 1
test[test$p<0.5,]$out<- 0
require(gmodels)
CrossTable(test$out,test$churn,prop.chisq = F,prop.t=F,dnn=c("predicted","actual"))
table(test$out,test$churn)
#----------------------------------------------
require(ROCR)
test_pred<-prediction(test$out,test$churn)
test_perf<-performance(test_pred,"tpr","fpr") # tpr=灵敏度 fpr=1-特异度
train$pred<-predict(m3,train) #生成测试集的结果
train$p<-1/(1+exp(-train$pred))
train$out<- 1
train[train$p<0.5,]$out<- 0
train_pred<-prediction(train$out,train$churn)
train_perf<-performance(train_pred,"tpr","fpr")
plot(test_perf,col="blue",lty=1)
plot(train_perf,col="red",lty=2,add=T) #把train与test放一起看,如果相差不大,模型可用
abline(0,1,lty=2,col="black")
#----------------------------------AUC
train_auc<-round(as.numeric(performance(train_pred,'auc')@y.values),3)
train_auc_str<-paste("Mode_Train_AUC:",train_auc,sep="")
legend(0.3,0.4,c(train_auc_str),2:8)
test_auc<-round(as.numeric(performance(test_pred,'auc')@y.values),3)
test_auc_str<-paste("Mode_Test_AUC:",test_auc,sep="")
legend(0.3,0.2,c(test_auc_str),2:8)
#---------------------------------------------- ridge ----------
x_ridge<-as.matrix(train_ridge[,-c(1)]) #glmnet必须是矩阵
y_ridge<-as.matrix(train_ridge[,1])
require(glmnet)
ridge<-glmnet(x=x_ridge,y=y_ridge,family = "binomial",alpha=0)
plot(ridge,xvar="lambda",label=T)
ridge.cv<-cv.glmnet(x=x_ridge,y=y_ridge,family="binomial",alpha=0,nfold=10)
plot(ridge.cv)
ridge.min<-glmnet(x=x_ridge,y=y_ridge,family="binomial",alpha=0,lambda=ridge.cv$lambda.min)
test_ridge_x<-as.matrix(test_ridge[,-c(1)])
test_ridge_pred <-predict(ridge.min,newx=test_ridge_x)
test_ridge_pred<-as.data.frame(test_ridge_pred)
test_ridge_pred$p<-1/(1+exp(-test_ridge_pred$s0))
test_ridge_pred$out<- 1
test_ridge_pred[test_ridge_pred$p<0.5,]$out<- 0
CrossTable(test_ridge_pred$out,test_ridge$churn,prop.chisq = F,prop.t=F,dnn=c("predicted","actual"))
table(test_ridge_pred$out,test_ridge$churn)
#虽然变量一个没少,但效果提升了很多
pred_ridge_test<-prediction(test_ridge_pred$out,test_lasso$churn)
ridge_perf<-performance(pred_ridge_test,"tpr","fpr")
ridge_auc<-round(as.numeric(performance(pred_ridge_test,'auc')@y.values),3)#算ridge的AUC
#-------------------------------------- lasso ------
x_lasso<-as.matrix(train_lasso[,-c(1)]) #glmnet必须是矩阵
y_lasso<-as.matrix(train_lasso[,1])
lasso<-glmnet(x=x_lasso,y=y_lasso,family = "binomial",alpha=1)
plot(lasso,xvar="lambda",label=T)
lasso.cv<-cv.glmnet(x=x_lasso,y=y_lasso,family="binomial",alpha=1,nfold=10)
plot(lasso.cv)
lasso.min<-glmnet(x=x_lasso,y=y_lasso,family="binomial",alpha=1,lambda=ridge.cv$lambda.min)
coef(lasso.min) #压缩到8个变量
test_lasso_x<-as.matrix(test_lasso[,-c(1)])
test_lasso_pred <-predict(lasso.min,newx=test_lasso_x)
test_lasso_pred<-as.data.frame(test_lasso_pred)
test_lasso_pred$p<-1/(1+exp(-test_lasso_pred$s0))
test_lasso_pred$out<- 1
test_lasso_pred[test_lasso_pred$p<0.5,]$out<- 0
table(test_lasso_pred$out)
CrossTable(test_lasso_pred$out,test_lasso$churn,prop.chisq = F,prop.t=F,dnn=c("predicted","actual"))
table(test_lasso_pred$out,test_lasso$churn)
#效果比GLM好
pred_lasso_test<-prediction(test_lasso_pred$out,test_lasso$churn)
lasso_perf<-performance(pred_lasso_test,"tpr","fpr")
lasso_auc<-round(as.numeric(performance(pred_lasso_test,'auc')@y.values),3)#算lasso的AUC
#---------------------------------------------- AUC比较
vs<-c(test_auc,ridge_auc,lasso_auc) #比较AUC
names(vs)<-c("Logistic-AUC","Ridge-AUC","Lasso-AUC")
print(vs)
#--------------------------结束