第四次作业-婚恋预测

浏览: 985
#%%
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.chdir(r'D:\give_me_five\githome\DataAnalysis\Case5')

matplotlib.rcParams['axes.unicode_minus']=False# fix minus sign rendering as a box when saving figures
plt.rcParams['font.sans-serif'] = ['SimHei']# default font that can display CJK characters
#%%
dat1 = pd.read_csv('./dataset/date_data2.csv')# load the dating dataset
dat1.head()
#%%
# class ratio is exactly 1:1
dat1.Dated.value_counts()
#%%
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

# split features/target and hold out 40% as the test set
dat1_x = dat1.drop(['Dated'],axis=1)
dat1_y = dat1.Dated
X_train,X_test,y_train,y_test = train_test_split(dat1_x,dat1_y,test_size=0.4,random_state=1024)
#%%

from sklearn.model_selection import GridSearchCV
# Grid search with 3-fold cross-validation (AUC scoring) to find the best
# decision-tree hyper-parameters.
model_dt = DecisionTreeClassifier()
search_space = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4, 5, 10, 20, 30],
}
gridsearch = GridSearchCV(model_dt, search_space, scoring='roc_auc', cv=3)
gridsearch.fit(X_train, y_train)
# Best hyper-parameters found: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
gridsearch.best_params_
#%%
# Fit the tree with chosen hyper-parameters.
# NOTE(review): criterion='entropy', max_depth=3 differs from the grid-search
# best reported above ({'criterion': 'gini', 'max_depth': 2,
# 'min_samples_split': 2}) — confirm this deviation is intentional.
model_dt = DecisionTreeClassifier(criterion='entropy',max_depth=3,min_samples_split=2)
model_dt.fit(X_train,y_train)
#%%
# Predict labels and positive-class probabilities for train and test sets
y_train_predict = model_dt.predict(X_train)# predicted labels on the training set
y_train_predict_p = model_dt.predict_proba(X_train)[:,1]# predicted P(class=1) on the training set
y_test_predict = model_dt.predict(X_test)# predicted labels on the test set
y_test_predict_p = model_dt.predict_proba(X_test)[:,1]# predicted P(class=1) on the test set
#%%
# Compare train vs. test ROC curves.
# The train curve fully encloses the test curve with slightly larger area —
# a hint of overfitting; the sample size is small.
fpr_test, tpr_test, th_test = roc_curve(y_test, y_test_predict_p)
fpr_train, tpr_train, th_train = roc_curve(y_train, y_train_predict_p)
plt.figure(figsize=[6, 6])
for xs, ys, line_colour, line_label in ((fpr_test, tpr_test, 'blue', 'test'),
                                        (fpr_train, tpr_train, 'red', 'train')):
    plt.plot(xs, ys, color=line_colour, label=line_label)
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(' Roc Curve')
plt.show()
#%%
import pydotplus
from IPython.display import Image
import sklearn.tree as tree

# Render each level of the fitted tree with graphviz; the splits suggest
# 'assets' has relatively high importance
dot_data = tree.export_graphviz(
model_dt,
out_file=None,
feature_names=dat1_x.columns,
max_depth=3,
class_names=['0','1'],
filled=True
)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
#%%

# Neural network
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: bring all numeric feature columns onto a common range.
# BUG FIX: the scaler must be fitted on the training set only and then
# applied to the test set with transform(); the original called
# fit_transform() on the test set, re-learning min/max from test data and
# putting the two sets on inconsistent scales (preprocessing leakage).
mmScaler = MinMaxScaler()
scaled_X_train = mmScaler.fit_transform(X_train)
scaled_X_test = mmScaler.transform(X_test)
#%%
# Grid search with 3-fold cross-validation (AUC scoring) over MLP hyper-parameters
param_grid = {
'hidden_layer_sizes':[(10, ), (15, ), (20, ), (5, 5)],
'activation':['logistic', 'tanh', 'relu'],
'alpha':[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
'max_iter':[200,500,600,1000]
}
model_mlp = MLPClassifier()
gridsearch = GridSearchCV(estimator=model_mlp, param_grid=param_grid,
scoring='roc_auc', cv=3, n_jobs=-1)
gridsearch.fit(scaled_X_train, y_train)
#%%
# Best parameters found: {'activation': 'relu','alpha': 0.4,'hidden_layer_sizes': (5,5),'max_iter': 1000}
gridsearch.best_params_
#%%
# Rebuild the MLP with the best parameters from the grid search
model_mlp = MLPClassifier(activation='relu',
alpha=0.4,hidden_layer_sizes=(5,5),max_iter=1000)
model_mlp.fit(scaled_X_train,y_train)
#%%
# Predict labels and positive-class probabilities for train and test sets
# (the original comments here mislabeled train-set lines as test-set)
y_train_predict = model_mlp.predict(scaled_X_train) # predicted labels on the training set
y_train_predict_p = model_mlp.predict_proba(scaled_X_train)[:,1]# predicted P(class=1) on the training set
y_test_predict = model_mlp.predict(scaled_X_test) # predicted labels on the test set
y_test_predict_p = model_mlp.predict_proba(scaled_X_test)[:,1]# predicted P(class=1) on the test set
#%%
# Compare train vs. test ROC curves for the MLP.
# Here the two curves are close to each other.
fpr_test, tpr_test, th_test = roc_curve(y_test, y_test_predict_p)
fpr_train, tpr_train, th_train = roc_curve(y_train, y_train_predict_p)
plt.figure(figsize=[6, 6])
roc_series = [(fpr_test, tpr_test, 'blue', 'test'),
              (fpr_train, tpr_train, 'red', 'train')]
for xs, ys, line_colour, line_label in roc_series:
    plt.plot(xs, ys, color=line_colour, label=line_label)
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(' Roc Curve')
plt.show()
#%%

# Discretize features into bins.
# qcut (equal-frequency, 5 bins) is run first to discover cut points;
# the columns are then immediately overwritten below with explicit cut() edges.
# income quantile edges found: [0,4500,6500,9000,11800]
dat1['income_bins'] = pd.qcut(dat1.income,5)
# attractive quantile edges found: [0,21,39,65,79]
dat1['attractive_bins'] = pd.qcut(dat1.attractive,5)
# assets quantile edges found: [0,25,52,94,145]
dat1['assets_bins'] = pd.qcut(dat1.assets,5)
#%%
# Re-bin with explicit edges derived from the quantiles above
income_bins = [0,4500,6500,9000,11800,34000]
dat1['income_bins'] = pd.cut(dat1.income,bins=income_bins)
attractive_bins = [0,21,39,65,79,100]
dat1['attractive_bins'] = pd.cut(dat1.attractive,bins=attractive_bins)
assets_bins = [0,25,52,94,145,486]
dat1['assets_bins'] = pd.cut(dat1.assets,bins=assets_bins)
#%%
# WOE/IV computation for one binned (categorical) column.

from pandas import DataFrame

def Func_CalcWOE(df, col, target):
    """Compute the per-category WOE values and the IV of *col*.

    Parameters
    ----------
    df : DataFrame holding `col` and a 0/1 `target` column (1 = event/"bad").
    col : name of the binned/categorical feature column.
    target : name of the binary target column.

    Returns
    -------
    dict with keys:
      'WOE' : {category: woe} where woe = ln(good_pcnt / bad_pcnt),
      'IV'  : information value, sum((good_pcnt - bad_pcnt) * woe).

    NOTE: a category containing zero goods or zero bads yields +/-inf WOE
    (no smoothing is applied) — same behaviour as the original code.
    """
    grouped = df.groupby([col])[target]
    # One groupby pass instead of two separate groupbys merged on the index.
    regroup = DataFrame({'total': grouped.count(), 'bad': grouped.sum()})
    regroup['good'] = regroup.total - regroup.bad
    B = regroup.bad.sum()
    G = regroup.good.sum()
    # Vectorized arithmetic replaces the per-element map/apply lambdas.
    regroup['bad_pcnt'] = regroup.bad / B
    regroup['good_pcnt'] = regroup.good / G
    regroup['woe'] = np.log(regroup.good_pcnt / regroup.bad_pcnt)
    WOE_dict = regroup['woe'].to_dict()
    # Builtin sum() kept deliberately: it propagates NaN like the original,
    # whereas Series.sum() would silently skip NaN categories.
    IV = sum((regroup.good_pcnt - regroup.bad_pcnt) * regroup.woe)
    return {'WOE': WOE_dict, 'IV': IV}
#%%
# Replace income_bins with its WOE encoding (IV ~ 1.33)
woe = Func_CalcWOE(dat1,'income_bins','Dated')['WOE']
dat1.income_bins = dat1.income_bins.map(lambda x:woe.get(x)).astype('float64')
#%%
# Replace attractive_bins with its WOE encoding (IV ~ 0.359)
woe = Func_CalcWOE(dat1,'attractive_bins','Dated')['WOE']
dat1.attractive_bins = dat1.attractive_bins.map(lambda x:woe.get(x)).astype('float64')
#%%
# assets is special: 0s and 1s are highly concentrated, so encode each bin
# by its empirical dating success rate (row-normalized crosstab) instead of WOE
dat2 = pd.crosstab(dat1.assets_bins,dat1.Dated)
dat2 = dat2.div(dat2.sum(1),axis=0)
assets_dict = dat2[1].to_dict()
dat1.assets_bins = dat1.assets_bins.map(lambda x:assets_dict.get(x)).astype('float64')
#%%
# Re-split using only the three encoded bin columns (last 3 columns) as features
datx_bins = dat1.iloc[:,-3:]
daty_bins = dat1.Dated
X_train,X_test,y_train,y_test = train_test_split(datx_bins,daty_bins,test_size=0.4,random_state=1024)


#%%
# Decision tree on the encoded features
from sklearn.model_selection import GridSearchCV
# Grid search with 3-fold cross-validation for the best hyper-parameters
model_dt = DecisionTreeClassifier()
params={'criterion':['gini','entropy'],'max_depth':[2,3,4,5,6,7,8],
'min_samples_split':[2,3,4,5,10,20,30]}
gridsearch = GridSearchCV(model_dt,params,scoring='roc_auc',cv=3)
gridsearch.fit(X_train,y_train)
# Best hyper-parameters found: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
gridsearch.best_params_
#%%
# Fit with chosen hyper-parameters.
# NOTE(review): criterion='entropy', max_depth=3 differs from the reported
# grid-search best above — confirm this deviation is intentional.
model_dt = DecisionTreeClassifier(criterion='entropy',max_depth=3,min_samples_split=2)
model_dt.fit(X_train,y_train)
#%%
# Predict labels and positive-class probabilities for train and test sets
y_train_predict = model_dt.predict(X_train)# predicted labels on the training set
y_train_predict_p = model_dt.predict_proba(X_train)[:,1]# predicted P(class=1) on the training set
y_test_predict = model_dt.predict(X_test)# predicted labels on the test set
y_test_predict_p = model_dt.predict_proba(X_test)[:,1]# predicted P(class=1) on the test set
#%%
# Compare train vs. test ROC curves.
# The train curve looks much better than the test curve — overfitting.
fpr_test, tpr_test, th_test = roc_curve(y_test, y_test_predict_p)
fpr_train, tpr_train, th_train = roc_curve(y_train, y_train_predict_p)
plt.figure(figsize=[6, 6])
curve_specs = {'test': (fpr_test, tpr_test, 'blue'),
               'train': (fpr_train, tpr_train, 'red')}
for curve_label, (xs, ys, curve_colour) in curve_specs.items():
    plt.plot(xs, ys, color=curve_colour, label=curve_label)
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(' Roc Curve')
plt.show()
#%%
# Neural network on the encoded features
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling to a common range.
# BUG FIX: fit the scaler on the training set only, then transform() the
# test set — the original fit_transform() on the test set re-learned
# min/max from test data, putting the two sets on inconsistent scales.
mmScaler = MinMaxScaler()
scaled_X_train = mmScaler.fit_transform(X_train)
scaled_X_test = mmScaler.transform(X_test)
#%%
# Grid search with 3-fold cross-validation (AUC scoring) over MLP hyper-parameters
param_grid = {
'hidden_layer_sizes':[(10, ), (15, ), (20, ), (5, 5)],
'activation':['logistic', 'tanh', 'relu'],
'alpha':[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
'max_iter':[200,500,600,1000]
}
model_mlp = MLPClassifier()
gridsearch = GridSearchCV(estimator=model_mlp, param_grid=param_grid,
scoring='roc_auc', cv=3, n_jobs=-1)
gridsearch.fit(scaled_X_train, y_train)
#%%
# Best parameters found: {'activation': 'tanh','alpha': 0.001,'hidden_layer_sizes': (5,5),'max_iter': 600}
gridsearch.best_params_
#%%
# Rebuild the MLP with the best parameters from the grid search
model_mlp = MLPClassifier(activation='tanh',
alpha=0.001,hidden_layer_sizes=(5,5),max_iter=600)
model_mlp.fit(scaled_X_train,y_train)
#%%
# Predict labels and positive-class probabilities for train and test sets
# (the original comments here mislabeled train-set lines as test-set)
y_train_predict = model_mlp.predict(scaled_X_train) # predicted labels on the training set
y_train_predict_p = model_mlp.predict_proba(scaled_X_train)[:,1]# predicted P(class=1) on the training set
y_test_predict = model_mlp.predict(scaled_X_test) # predicted labels on the test set
y_test_predict_p = model_mlp.predict_proba(scaled_X_test)[:,1]# predicted P(class=1) on the test set
#%%
# Compare train vs. test ROC curves for the MLP — the two curves differ somewhat.
fpr_test, tpr_test, th_test = roc_curve(y_test, y_test_predict_p)
fpr_train, tpr_train, th_train = roc_curve(y_train, y_train_predict_p)
plt.figure(figsize=[6, 6])
plt.plot(fpr_test, tpr_test, label='test', color='blue')
plt.plot(fpr_train, tpr_train, label='train', color='red')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(' Roc Curve')
plt.legend()
plt.show()

推荐 1
本文由 keikeiML 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册