# 老师:先用 CART,预测 100% 正确;后用 C5.0 时怕不准,就用 caret 调参得到 trials=10,预测结果也是 100%;再回到 trials=1 还是 100% 正确。晕,是不是我哪里错了?还需要分组(分箱)吗?
#income-月均收入(元)
#attractive-由婚恋网站评定出的个人魅力值,分值从0-100。
#assets-资产(万元)
#edueduclass-教育等级:1=小学,2=初中;3=高中,4=本科,5=硕士及以上
#Dated-是否相亲成功:1代表成功
#1)使用决策树、神经网络建立相亲成功预测模型并通过调节超参数进行模型调优,比较两个模型的优劣。
#2)对income,attractive,assets进行分箱(5分箱)处理,用分箱后的数据建模,并比较与1)步骤中模型的表现是否有差异。
require(rpart.plot)
require(caret)
require(rpart)
require(C50)
# ---- Load the data, inspect distributions, and build the train/test split ----
# NOTE(review): hard-coded, machine-specific working directory. Kept so that the
# relative CSV path below (and any later relative paths) still resolve.
setwd('E:/finance_R')
data <- read.csv("date_data2.csv")

# Per-column summaries. summary() on the data frame summarises each column with
# its own type; apply(data, 2, ...) would first coerce the frame to a matrix.
summary(data)                     # author's observation: no extreme values
which(!complete.cases(data))      # indices of incomplete rows (author: none)

# Distribution checks; the skewed variables also get a log-transformed copy.
hist(data$income, breaks = 40)           # right-skewed
data$income_log <- log(data$income)      # log transform to reduce the skew
hist(data$income_log, breaks = 40)       # roughly bell-shaped after the log
hist(data$attractive, breaks = 30)       # acceptable; a few empty bins mid-range
hist(data$assets, breaks = 30)           # right-skewed
data$assets_log <- log(data$assets)      # log transform to reduce the skew
hist(data$assets_log, breaks = 30)       # roughly bell-shaped after the log

# Education level and the target are categorical, so store them as factors.
data$edueduclass <- as.factor(data$edueduclass)
data$Dated <- as.factor(data$Dated)
table(data$Dated)                 # class balance (author observed 50:50)

# 80/20 split via createDataPartition on the target; seed fixed so the split
# is reproducible.
set.seed(2018)
id <- createDataPartition(y = data$Dated, p = 0.8, list = FALSE)
train <- data[id, ]
test <- data[-id, ]
# ---- CART (rpart) ----
# Exploratory fit 1: log-transformed income/assets.
tc <- rpart.control(minsplit = 1, minbucket = 1, maxdepth = 10, xval = 5,
                    cp = 0.01)
rpart.mod <- rpart(Dated ~ attractive + edueduclass + income_log + assets_log,
                   data = train, method = "class",
                   parms = list(split = "gini"), control = tc)
rpart.plot(rpart.mod, branch = 1, extra = 106, under = TRUE, faclen = 0,
           cex = 0.8)

# Exploratory fit 2: raw income/assets. The author found the log transform made
# no difference to the tree and the raw scale is easier to interpret, so the
# raw variables are used from here on. A small cp grows a deep tree whose
# complexity table is inspected below.
tc <- rpart.control(minsplit = 1, minbucket = 1, maxdepth = 10, xval = 5,
                    cp = 0.001)
rpart.mod <- rpart(Dated ~ attractive + edueduclass + income + assets,
                   data = train, method = "class",
                   parms = list(split = "gini"), control = tc)
rpart.plot(rpart.mod, branch = 1, extra = 106, under = TRUE, faclen = 0,
           cex = 0.8)

# Choosing cp. The author observed edueduclass ranked second in importance
# (20.66) although it is not drawn in the tree: rpart's importance measure also
# credits a variable for the splits where it acts as a surrogate, so a variable
# can score highly without ever being a primary split.
rpart.mod$variable.importance
# Spell out `cptable`; `$cp` only worked through partial name matching.
rpart.mod$cptable                 # author picked cp = 0.0125 from this table

# Final CART model with the chosen cp.
tc <- rpart.control(minsplit = 1, minbucket = 1, maxdepth = 10, xval = 5,
                    cp = 0.0125)
rpart.mod <- rpart(Dated ~ attractive + edueduclass + income + assets,
                   data = train, method = "class",
                   parms = list(split = "gini"), control = tc)
rpart.plot(rpart.mod, branch = 1, extra = 106, under = TRUE, faclen = 0,
           cex = 0.8)

# Hold-out evaluation: confusion table of predicted vs. actual class.
# The author reports every test row was classified correctly.
# NOTE(review): perfect hold-out accuracy suggests Dated may be a (near-)
# deterministic function of these predictors in this data set - verify against
# how the data were generated before trusting the result.
pre <- predict(rpart.mod, newdata = test, type = "class")
table(pre, test$Dated)
# ---- C5.0 ----
# One control object shared by every direct C5.0() fit in this section
# (the original built the identical object twice).
tc <- C5.0Control(CF = 0.99, winnow = FALSE, noGlobalPruning = TRUE,
                  minCases = 1)

# Single tree on the log-transformed predictors.
model <- C5.0(Dated ~ attractive + edueduclass + income_log + assets_log,
              data = train, trials = 1, rules = FALSE, control = tc)
plot(model)   # author's note: the plot could not be produced with rules = TRUE

# Single tree on the raw predictors; the author saw little difference from
# the log-scale fit.
model <- C5.0(Dated ~ attractive + edueduclass + income + assets,
              data = train, trials = 1, rules = FALSE, control = tc)
plot(model)

# Tune the number of boosting trials with 5-fold CV, selecting by Kappa under
# the one-standard-error rule. The seed makes the CV folds, and therefore the
# selected `trials`, reproducible from run to run.
set.seed(2018)
ctrl <- trainControl(method = "cv", number = 5, selectionFunction = "oneSE")
grid <- expand.grid(.model = "tree", .trials = seq(1, 50), .winnow = FALSE)
m <- train(Dated ~ attractive + edueduclass + income + assets,
           data = train, method = "C5.0", metric = "Kappa",
           trControl = ctrl, tuneGrid = grid)
m             # the author's original (unseeded) run selected trials = 10

# Refit with the tuned number of trials, read from the caret result instead of
# being copied by hand from the console. `model = "tree"` was dropped: it is a
# caret tuning-grid name, not a C5.0() argument; rules = FALSE already requests
# a tree.
# NOTE(review): this refit uses `tc`, whereas the caret search above ran with
# caret's own control settings - confirm that the mismatch is intended.
model_2 <- C5.0(Dated ~ attractive + edueduclass + income + assets,
                data = train, trials = m$bestTune$trials, rules = FALSE,
                control = tc)
summary(model_2)
C5imp(model_2)   # author's note: the boosted ensemble uses more variables than a single tree

# Hold-out evaluation: confusion table of predicted vs. actual class
# (the author reports 100% accuracy here as well).
p <- predict(model_2, newdata = test)
table(p, test$Dated)
#------------------------------------------------------------------------