作业要求:
1、背景介绍:
一家婚恋网站公司希望根据已注册用户的历史相亲数据,建立新用户相亲成功可能性的预测模型,数据存放在“date_data2.csv”中。
2、主要变量说明如下:
#income-月均收入(元)
#attractive-由婚恋网站评定出的个人魅力值,分值从0-100。
#assets-资产(万元)
#edueduclass-教育等级:1=小学,2=初中,3=高中,4=本科,5=硕士及以上
#Dated-是否相亲成功:1代表成功
3、作业安排:
3.1 基础知识:
1)比较逻辑回归、决策树、神经网络三种算法的差异性。
3.2 案例解答步骤如下:
1)使用决策树、神经网络建立相亲成功预测模型并通过调节超参数进行模型调优,比较两个模型的优劣。
2)对income,attractive,assets进行分箱(5分箱)处理,用分箱后的数据建模,并比较与1)步骤中模型的表现是否有差异。
# coding: utf-8
# In[68]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.feature_selection as feature_selection
import sklearn.model_selection as cross_validation
import sklearn.metrics as metrics
# In[16]:
# Load the matchmaking dataset (GBK-encoded CSV).
# NOTE(review): the hard-coded absolute path makes this script machine-specific;
# consider reading the directory from a config value or CLI argument.
os.chdir(r'E:\数据分析\天善-PYTHON\八场直播,八大案例(金融风控)\第6讲\提交-第六讲:使用决策树和神经网络预测客户流失倾向\作业')
# FIX: after os.chdir() to the data directory, read by relative name instead
# of repeating the full absolute path (the original duplicated it verbatim).
data = pd.read_csv('date_data2.csv', encoding='gbk')
# Rescale monthly income to units of 10,000 yuan so features have comparable magnitudes.
data['income_new'] = data['income'] / 10000
data.head()
# In[11]:
# In[18]:
# --- Decision-tree section ---
# Discretize the continuous predictors into 5 equal-frequency (quantile) bins.
_bin_columns = {
    'income_new': 'income_bin',
    'attractive': 'attractive_bin',
    'assets': 'assets_bin',
}
for src_col, bin_col in _bin_columns.items():
    data[bin_col] = pd.qcut(data[src_col], 5)
# Sanity check: row count and mean of rescaled income within each income bin.
data['income_new'].astype('int64').groupby(data['income_bin']).agg(['count', 'mean'])
# In[48]:
# Univariate relevance screen: chi-square statistic of each candidate
# feature against the binary target `Dated`.
candidate_features = ['income_new', 'attractive', 'assets', 'edueduclass']
feature_selection.chi2(data[candidate_features], data['Dated'])
# In[58]:
# Hold out 40% of the rows for testing; fixed seed keeps the split reproducible.
target = data['Dated']
model_data = data[['income_new', 'attractive', 'assets', 'edueduclass']]
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    model_data,
    target,
    test_size=0.4,
    train_size=0.6,
    random_state=12345,
)
# In[60]:
# Baseline classifier: an entropy-criterion decision tree with mild
# pre-pruning (depth and split-size limits) to curb overfitting.
import sklearn.tree as tree

baseline_params = dict(criterion='entropy', max_depth=5, min_samples_split=10)
clf = tree.DecisionTreeClassifier(**baseline_params)
clf.fit(train_data, train_target)
# In[69]:
# Inspect model predictions on both partitions.
train_set = clf.predict(train_data)
train_set_p = clf.predict_proba(train_data)[:, 1]  # P(Dated == 1)
test_set = clf.predict(test_data)
test_set_p = clf.predict_proba(test_data)[:, 1]
pd.DataFrame({'test_target': test_target, 'test_set': test_set, 'test_set_p': test_set_p})
# In[70]:
print(metrics.confusion_matrix(test_target, test_set, labels=[0, 1]))  # confusion matrix
print(metrics.classification_report(test_target, test_set))  # precision / recall / F1
# BUG FIX: feature importances must be paired with the columns the model was
# actually trained on (train_data has 4 columns), not with every column of
# `data` (which also contains the target and the *_bin columns) — the original
# zip mislabeled every importance value.
print(pd.DataFrame(list(zip(train_data.columns, clf.feature_importances_))))
# In[72]:
# Overlay the predicted-probability histograms of the two true classes;
# good separation shows up as little overlap between red (Dated=1) and blue (Dated=0).
red, blue = sns.color_palette("Set1", 2)
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot is the
# replacement) — confirm the installed seaborn version before upgrading.
sns.distplot(test_set_p[test_target == 1], kde=False, bins=15, color=red)
sns.distplot(test_set_p[test_target == 0], kde=False, bins=15,color=blue)
plt.show()
# In[73]:
# ROC curves on both partitions; a large train/test gap signals overfitting.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_set_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_set_p)
plt.figure(figsize=[6, 6])
for rates_x, rates_y, curve_color in ((fpr_test, tpr_test, blue),
                                      (fpr_train, tpr_train, red)):
    plt.plot(rates_x, rates_y, color=curve_color)
plt.title('ROC curve')
plt.show()
# In[74]:
# Hyper-parameter tuning: exhaustive grid search over tree depth and
# minimum split size, scored by cross-validated ROC AUC.
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# In[80]:
param_grid = {
    'max_depth': [2, 3, 4, 5, 7, 9, 10],
    'min_samples_split': [4, 8, 12, 16, 20, 24, 28, 34],
}
clf = tree.DecisionTreeClassifier(criterion='entropy')
# 4-fold cross-validation on the training partition only.
clfcv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
)
clfcv.fit(train_data, train_target)
# In[81]:
# Re-score with the tuned estimator and redraw both ROC curves.
train_set = clfcv.predict(train_data)
train_set_p = clfcv.predict_proba(train_data)[:, 1]
test_set = clfcv.predict(test_data)
test_set_p = clfcv.predict_proba(test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_set_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_set_p)
plt.figure(figsize=[6, 6])
for rates_x, rates_y, curve_color in ((fpr_test, tpr_test, blue),
                                      (fpr_train, tpr_train, red)):
    plt.plot(rates_x, rates_y, color=curve_color)
plt.title('ROC curve')
plt.show()
# In[91]:
clfcv.best_params_
# BUG FIX: refit with the parameters the grid search actually selected
# instead of hard-coding max_depth=3 / min_samples_split=4 — the hard-coded
# values silently go stale whenever the data, seed, or grid changes.
clf = tree.DecisionTreeClassifier(criterion='entropy', **clfcv.best_params_)
clf.fit(train_data, train_target)  # refit on the training partition
# In[92]:
# Visualize the fitted tree (requires Graphviz plus the pydotplus binding).
import pydotplus
from IPython.display import Image
import sklearn.tree as tree
# In[93]:
# Export the fitted tree as DOT source, then render it to PNG inline.
dot_data = tree.export_graphviz(
clf,
out_file=None,  # return the DOT source as a string instead of writing a file
feature_names=train_data.columns,
max_depth=3,  # truncate only the drawing depth; the model itself is unchanged
class_names=['0','1'],
filled=True  # color nodes by majority class
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# In[94]:
# --- Neural-network section ---
from sklearn.model_selection import train_test_split
# In[101]:
# NOTE(review): this model uses the raw `income` column while the tree used
# the rescaled `income_new` — presumably harmless because the data is min-max
# scaled below, but confirm the difference was intentional.
nn_features = ['income', 'attractive', 'assets', 'edueduclass']
data_new = data[nn_features]
target = data['Dated']
train_data, test_data, train_target, test_target = train_test_split(
    data_new, target, test_size=0.4, train_size=0.6, random_state=123)
# In[126]:
# Min-max scale all features to [0, 1]; MLPs are sensitive to feature scale.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training partition only, then apply it to both
# partitions so no test-set information leaks into the scaling.
scaler = MinMaxScaler().fit(train_data)
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)
train_data.head()
# In[141]:
# Single hidden layer of 10 logistic units, L2 penalty alpha=0.1,
# iteration cap raised to 1000 so the solver can converge.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(10,),
    activation='logistic',
    alpha=0.1,
    max_iter=1000,
)
mlp.fit(scaled_train_data, train_target)
mlp
# In[147]:
# Class labels and positive-class probabilities for both partitions.
train_predict = mlp.predict(scaled_train_data)
train_proba = mlp.predict_proba(scaled_train_data)[:, 1]
test_predict = mlp.predict(scaled_test_data)
test_proba = mlp.predict_proba(scaled_test_data)[:, 1]
# In[150]:
from sklearn import metrics

# Test-set confusion matrix and the standard classification report.
print(metrics.confusion_matrix(test_target, test_predict, labels=[0, 1]))
print(metrics.classification_report(test_target, test_predict))
# In[151]:
# ROC curves for the MLP on both partitions, plus the test-set AUC.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_proba)
plt.figure(figsize=[4, 4])
plt.plot(fpr_test, tpr_test)    # the empty format string in the original was a no-op
plt.plot(fpr_train, tpr_train)
plt.title('ROC curve')
plt.show()
print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))