ben-老师第三次作业提交

发表: 2018-07-03 浏览: 1491

八大直播作业

# -*- coding: utf-8 -*-

"""

Created on Tue Jul 3 09:01:06 2018

1、背景介绍：

一家婚恋网站公司希望根据已注册用户的历史相亲数据,建立新用户相亲成功可能性的预测模型，数据存放在“date_data2.csv”中。

2、主要变量说明如下：#income-月均收入（元）

#attractive-由婚恋网站评定出的个人魅力值,分值从0-100。

#assets-资产(万元) #edueduclass-教育等级:1=小学,2=初中;3=高中,4=本科,5=硕士及以上

#Dated-是否相亲成功:1代表成功

3、作业安排：3.1 基础知识：

1）比较逻辑回归、决策树、神经网络三种算法的差异。

3.2 案例解答步骤如下：

1）使用决策树、神经网络建立相亲成功预测模型并通过调节超参数进行模型调优，比较两个模型的优劣。

2)对income,attractive,assets进行分箱(5分箱)处理，用分箱后的数据建模，

并比较与1）步骤中模型的表现是否有差异。

model_selection KFold

cross_validation KFold

两者的差异

@author: 知行合一

"""

import numpy as np

import pandas as pd

import os

from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold

# Switch to the assignment folder so the relative CSV path below resolves.
os.chdir(r'C:\Users\知行合一\Documents\ben\第六讲作业')

# In[4]:

# Read the historical dating records; no column is used as the index.
date_data2 = pd.read_csv(r'date_data2.csv', index_col=None)

#index=np.arange(0,date_data2.shape[0])
#date_data2.reindex(index)

# In[4]:

# Data preparation: 'Dated' is the target, every other column is a feature.
#date_data2.describe()
y = date_data2['Dated']
X = date_data2.drop(columns=['Dated'])

# Hold out 40% of the rows as the test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.4,
)

# In[4]:

#比较逻辑回归、决策树、神经网络三种算法的差异。

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier

from sklearn import metrics

import matplotlib.pyplot as plt

# In[4]:

# Baseline estimators used for the first comparison round.

# L2-regularised logistic regression.
logistic = LogisticRegression(C=0.1, penalty='l2')

# Entropy-based decision tree limited to depth 5, with a fixed seed.
_tree_settings = dict(
    criterion='entropy',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=12345,
)
decisionTree = DecisionTreeClassifier(**_tree_settings)

# Neural network with one hidden layer of 10 logistic units.
_mlp_settings = dict(
    hidden_layer_sizes=(10,),
    activation='logistic',
    alpha=0.1,
    max_iter=100000,
)
mlp = MLPClassifier(**_mlp_settings)

# Shuffled 3-fold splitter that modelTrain uses for cross-validation.
folder = KFold(n_splits=3, random_state=1, shuffle=True)

# y_pred keeps the row order of X_train (one out-of-fold prediction per row).
# 1. Cross-validation: training procedure.
def modelTrain(model, X_train, X_test, y_train, cv=None):
    """Cross-validate ``model`` on the training data, then predict the test set.

    For every fold produced by the splitter, the model is fitted on the
    training part of the fold and used to predict the validation part.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train : pandas.DataFrame
        Training features (rows are selected with ``.iloc``).
    X_test : features passed to ``model.predict`` after the last fold.
    y_train : pandas.Series
        Training labels, positionally aligned with ``X_train``.
    cv : object with ``split(X, y)``, optional
        Splitter yielding ``(train_positions, validation_positions)`` pairs.
        Defaults to the module-level ``folder``.

    Returns
    -------
    (y_pred, y_pred_test)
        ``y_pred`` is a list of out-of-fold predictions in the row order of
        ``X_train``; ``y_pred_test`` is ``model.predict(X_test)``.

    Notes
    -----
    The test-set prediction comes from the model as fitted on the last fold;
    the model is not refitted on the full training set.
    Assumes every training row appears in exactly one validation fold, as
    with a K-fold splitter.
    """
    splitter = folder if cv is None else cv

    fold_positions = []
    fold_predictions = []
    for train_index, validation_index in splitter.split(X_train, y_train):
        X_t = X_train.iloc[train_index, :]
        X_v = X_train.iloc[validation_index, :]
        # The fold indices are positions inside X_train / y_train, so the
        # labels must come from the y_train argument, not from the full
        # module-level target, and the argument must not be overwritten.
        y_t = y_train.iloc[train_index]
        model.fit(X_t, y_t)
        fold_positions.append(np.asarray(validation_index))
        fold_predictions.append(np.asarray(model.predict(X_v)))

    # Folds are visited in shuffled order; scatter the predictions back to
    # their original positions so they line up with y_train.
    positions = np.concatenate(fold_positions)
    predictions = np.concatenate(fold_predictions)
    ordered = np.empty_like(predictions)
    ordered[positions] = predictions
    y_pred = list(ordered)

    y_pred_test = model.predict(X_test)
    return y_pred, y_pred_test

#1、交叉验证，实现测试验证过程

#对测试集进行对比预测

# In[4]:

# Comparative analysis: run each baseline model through the same
# cross-validation routine and draw its ROC curves.
# Blue = test-set predictions, red = cross-validated training predictions.
# NOTE: modelTrain returns class labels from predict(), not probabilities.
for model, title in (
    (logistic, 'Logistic ROC curve'),
    (decisionTree, 'DecisionTreeClassifier ROC curve'),
    (mlp, 'MLPclassifier ROC curve'),
):
    y_pred, y_pred_test = modelTrain(model, X_train, X_test, y_train)

    fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, y_pred_test)
    fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, y_pred)

    plt.figure(figsize=[6, 6])
    plt.plot(fpr_test, tpr_test, color='blue')
    plt.plot(fpr_train, tpr_train, color='red')
    plt.title(title)
    plt.show()

    # Area under the test-set ROC curve.
    test_auc = metrics.auc(fpr_test, tpr_test)
    print('AUC = %6.4f' % test_auc)

# In[4]:

#模型参数对比

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the neural network: 4 layer layouts x 3 activation
# functions x 8 regularisation strengths.
param_grid = {
    'hidden_layer_sizes': [(5, 5), (3, 3), (2, 2), (2, 6)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.2, 0.4, 1, 10, 12, 14, 16, 18],
}

# Exhaustive search scored by ROC AUC with 4-fold cross-validation.
# Note: this rebinds ``mlp`` to a fresh, untuned estimator.
mlp = MLPClassifier(max_iter=1000)
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4)
gcv.fit(X_train, y_train)

# In[4]:

# A bare expression shows nothing when the file runs as a script,
# so print the winning combination explicitly.
print(gcv.best_params_)

# In[4]:

# Hyper-parameter grid for the decision tree: 2 split criteria x 6 depths
# x 3 minimum split sizes.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4],
}

# Exhaustive search scored by ROC AUC with 4-fold cross-validation.
clf = DecisionTreeClassifier()
clfcv = GridSearchCV(estimator=clf, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
clfcv.fit(X_train, y_train)

# In[4]:

# A bare expression shows nothing when the file runs as a script,
# so print the winning combination explicitly.
print(clfcv.best_params_)

# In[4]:

# After model selection, re-validate both models with tuned hyper-parameters.
# NOTE(review): the values below are hard-coded; presumably they were copied
# from best_params_ of an earlier grid-search run -- confirm before reuse.
decisionTree = DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                      min_samples_split=2)
mlp = MLPClassifier(hidden_layer_sizes=(5, 5), activation='tanh', alpha=14)

# Same evaluation for both models. The decision-tree figure used to carry the
# MLP title (copy-paste slip); each plot is now labelled with its own model.
# Blue = test-set predictions, red = cross-validated training predictions.
for model, title in (
    (decisionTree, 'DecisionTreeClassifier ROC curve'),
    (mlp, 'MLPclassifier ROC curve'),
):
    y_pred, y_pred_test = modelTrain(model, X_train, X_test, y_train)

    fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, y_pred_test)
    fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, y_pred)

    plt.figure(figsize=[6, 6])
    plt.plot(fpr_test, tpr_test, color='blue')
    plt.plot(fpr_train, tpr_train, color='red')
    plt.title(title)
    plt.show()

    # Area under the test-set ROC curve.
    print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))

#2)对income,attractive,assets进行分箱(5分箱)处理，用分箱后的数据建模，

#假设要将某个自变量的观测值分为k个分箱，一些常用的分箱方法有：

#1.无监督分箱（1）等宽分箱：

#将变量的取值范围分为k个等宽的区间，每个区间当作一个分箱。

#（2）等频分箱：把观测值按照从小到大的顺序排列，根据观测的个数等分为k部分，每部分当作一个分箱，

#例如，数值最小的1/k比例的观测形成第一个分箱，等等。

#（3）基于k均值聚类的分箱：使用第五章将介绍的k均值聚类法将观测值聚为k类，但在聚类过程中需要保证分箱的有序性：

#第一个分箱中所有观测值都要小于第二个分箱中的观测值，第二个分箱中所有观测值都要小于第三个分箱中的观测值，等等。

#2.有监督分箱

#在分箱时考虑因变量的取值，使得分箱后达到最小熵（minimum entropy）或最小描述长度（minimum description length）。这里仅介绍最小熵。

#（1）假设因变量为分类变量，可取值1，…，J。令$p_l(j)$表示第l个分箱内因变量取值为j的观测的比例，

#l=1，…，k，j=1，…，J；那么第l个分箱的熵值为$\sum_{j=1}^{J}\left[-p_l(j)\log p_l(j)\right]$。

#如果第l个分箱内因变量各类别的比例相等，即$p_l(1)=\dots=p_l(J)=1/J$，那么第l个分箱的熵值达到最大值；

#如果第l个分箱内因变量只有一种取值，即某个$p_l(j)$等于1而其他类别的比例等于0，那么第l个分箱的熵值达到最小值。

#（2）令$r_l$表示第l个分箱的观测数占所有观测数的比例；

#那么总熵值为$\sum_{l=1}^{k} r_l \times \sum_{j=1}^{J}\left[-p_l(j)\log p_l(j)\right]$。

#需要使总熵值达到最小，也就是使分箱能够最大限度地区分因变量的各类别。

0 个评论

要回复文章请先登录或注册