本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点或对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的。如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。
4 个评论
#%%
import os
import pandas as pd

# Switch to the assignment's working directory and load the fraud training sample.
os.chdir(r'D:\Learningfile\天善学院\280_Ben_八大直播八大案例配套课件\提交-第七讲:使用集成算法建立个人银行反欺诈模型\作业')
data = pd.read_excel('FRAUD_TRAIN_Samp.xlsx')
#%%
# Inspect the class balance of the target and the candidate categorical columns.
data['target'].value_counts()
#%%
data['F81'].value_counts()
#%%
data['F74'].value_counts()
#%%
data['F75'].value_counts()
#%%
data['F80'].value_counts()
#%%
import numpy as np

# Recode F81 (investment style): '无等级' and missing -> 0, any stated style -> 1.
# BUG FIX: the original used np.nan as a dict key; whether a NaN read from the
# file matches that key depends on object identity, which is fragile. Map the
# known labels and fill everything unmatched (including NaN) with 0 instead.
dict_F81 = {'无等级': 0,
            '平衡': 1,
            '稳健': 1,
            '成长': 1,
            '进取': 1,
            '保守': 1}
data['F81'] = data['F81'].map(dict_F81).fillna(0).astype(int)
#%%
# Recode F75 (gender): M -> 1, F -> 0, missing/unknown -> 2 (same NaN-key fix).
dict_F75 = {'M': 1, 'F': 0}
data['F75'] = data['F75'].map(dict_F75).fillna(2).astype(int)
#%% missing-value handling (缺失值处理)
# Drop the id column and the two unencoded text columns, then zero-fill the
# remaining gaps; `drop` returns a new frame, so `data` itself is untouched.
data1 = data.drop(columns=['csr_id', 'F74', 'F80']).fillna(0)
#%%
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

data_X = data1.iloc[:, 1:]   # predictors: every column after the target
data_y = data1.iloc[:, 0]    # target: first column
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.4, train_size=0.6, random_state=12345)
#%%
# Resampling (采样): SMOTE over-sampling combined with Tomek-link cleaning to
# balance the minority (fraud) class before fitting the tree.
from imblearn.combine import SMOTETomek
kos = SMOTETomek(random_state=0)  # combined over/under sampling
# BUG FIX: fit_sample() was removed in imbalanced-learn 0.8; use fit_resample().
X_kos, y_kos = kos.fit_resample(train_X, train_y)
#%%
# Decision-tree model (决策树建模): grid-search a CART tree on the resampled data.
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier(criterion='gini', random_state=1234)
param_grid = {
    'max_depth': [4, 12, 13, 14],
    'max_leaf_nodes': [4, 12, 30, 31, 32],
}
cv = GridSearchCV(clf, param_grid=param_grid, scoring='f1')
cv.fit(X_kos, y_kos)
predict_test = cv.predict(test_X)
test_proba = cv.predict_proba(test_X)[:, 1]
#%%
from sklearn import metrics
# Evaluate on the untouched test split using the predicted fraud probability.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_y, test_proba)
print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))
#print(metrics.classification_report(test_y, predict_test))
#%%
cv.best_params_
#%%
# Neural-network model (神经网络建模): an MLP needs features on a common scale,
# so min-max normalise with parameters learned on the training split only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(train_X)
scaled_train_data = scaler.transform(train_X)
scaled_test_data = scaler.transform(test_X)
# In[5]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10,), activation='logistic',
                    alpha=0.1, max_iter=1000)
mlp.fit(scaled_train_data, train_y)
mlp
#%%
predict_test2 = mlp.predict(scaled_test_data)
test_proba2 = mlp.predict_proba(scaled_test_data)[:, 1]
from sklearn import metrics
# ROC-AUC on the test split from the positive-class probabilities.
fpr_test2, tpr_test2, th_test2 = metrics.roc_curve(test_y, test_proba2)
print('AUC = %6.4f' % metrics.auc(fpr_test2, tpr_test2))
#print(metrics.classification_report(test_y, predict_test2))
#%%
## Random-forest model (RF建模) with grid-searched hyper-parameters.
import sklearn.ensemble as ensemble

param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [4, 5, 6, 7],
    'n_estimators': [24, 26, 28],        # number of trees (RF-specific)
    'max_features': [0.02, 0.04, 0.4],   # fraction of features per tree (RF-specific)
    'min_samples_split': [74, 75, 78],
}
rfc = ensemble.RandomForestClassifier()
rfccv = GridSearchCV(estimator=rfc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
rfccv.fit(train_X, train_y)
test_est3 = rfccv.predict(test_X)
# BUG FIX: roc_curve must be fed scores/probabilities, not hard 0/1 labels
# (labels collapse the curve to a single point and understate the AUC).
test_proba3 = rfccv.predict_proba(test_X)[:, 1]
fpr_test3, tpr_test3, th_test3 = metrics.roc_curve(test_y, test_proba3)
print('random forest ROC')   # label fixed: this is the RF model, not a tree
print('ROC=%.4f' % metrics.auc(fpr_test3, tpr_test3))
#%%
rfccv.best_params_
#%%
# AdaBoost (Adaboost算法): SAMME boosting, tuning only the learning rate.
param_grid = {
    #'base_estimator':['DecisionTreeClassifier'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
}
abc = ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME')
abccv = GridSearchCV(estimator=abc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
abccv.fit(train_X, train_y)
test_est4 = abccv.predict(test_X)
print("abc classifier accuracy:")
print(metrics.classification_report(test_y, test_est4))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba4 = abccv.predict_proba(test_X)[:, 1]
fpr_test4, tpr_test4, th_test4 = metrics.roc_curve(test_y, test_proba4)
print('AdaBoost ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test4, tpr_test4))
#%%
abccv.best_params_
# In[ ]:
# GBDT: gradient-boosted trees, first hyper-parameter sweep.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.8],
    'n_estimators': [26, 28, 30, 32],   # number of trees (GBDT-specific)
    'max_depth': [4, 5, 6],             # max depth per tree (GBDT-specific)
    'min_samples_split': [2, 15, 20, 25],
}
gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
gbccv.fit(train_X, train_y)
test_est5 = gbccv.predict(test_X)
print("gradient boosting accuracy:")
print(metrics.classification_report(test_y, test_est5))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba5 = gbccv.predict_proba(test_X)[:, 1]
fpr_test5, tpr_test5, th_test5 = metrics.roc_curve(test_y, test_proba5)
print('GBDT ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test5, tpr_test5))
#%%
gbccv.best_params_
#%%
# GBDT: second, wider hyper-parameter sweep with a fixed random seed.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.05, 0.1, 0.2, 0.8],
    'n_estimators': [26, 30, 70, 80, 90],   # number of trees (GBDT-specific)
    'max_depth': [3, 4, 5],                 # max depth per tree (GBDT-specific)
    'min_samples_split': [2, 3],
}
gbc = ensemble.GradientBoostingClassifier(random_state=12)
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
gbccv.fit(train_X, train_y)
test_est5 = gbccv.predict(test_X)
print("gradient boosting accuracy:")
print(metrics.classification_report(test_y, test_est5))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba5 = gbccv.predict_proba(test_X)[:, 1]
fpr_test5, tpr_test5, th_test5 = metrics.roc_curve(test_y, test_proba5)
print('GBDT ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test5, tpr_test5))
gbccv.best_params_
#%%
# PR curve (PR曲线) for the MLP model.
# BUG FIX: the original plotted TPR against 1-FPR, which is not a
# precision-recall curve at all; precision must come from
# precision_recall_curve. Typo 'precisoin' also fixed.
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

precision2, recall2, _ = precision_recall_curve(test_y, test_proba2)
plt.figure(figsize=[6, 6])
plt.plot(precision2, recall2)
plt.ylabel('recall')
plt.xlabel('precision')
plt.title('pr curve')
plt.show()
import os
import pandas as pd

# Switch to the assignment's working directory and load the fraud training sample.
os.chdir(r'D:\Learningfile\天善学院\280_Ben_八大直播八大案例配套课件\提交-第七讲:使用集成算法建立个人银行反欺诈模型\作业')
data = pd.read_excel('FRAUD_TRAIN_Samp.xlsx')
#%%
# Inspect the class balance of the target and the candidate categorical columns.
data['target'].value_counts()
#%%
data['F81'].value_counts()
#%%
data['F74'].value_counts()
#%%
data['F75'].value_counts()
#%%
data['F80'].value_counts()
#%%
import numpy as np

# Recode F81 (investment style): '无等级' and missing -> 0, any stated style -> 1.
# BUG FIX: the original used np.nan as a dict key; whether a NaN read from the
# file matches that key depends on object identity, which is fragile. Map the
# known labels and fill everything unmatched (including NaN) with 0 instead.
dict_F81 = {'无等级': 0,
            '平衡': 1,
            '稳健': 1,
            '成长': 1,
            '进取': 1,
            '保守': 1}
data['F81'] = data['F81'].map(dict_F81).fillna(0).astype(int)
#%%
# Recode F75 (gender): M -> 1, F -> 0, missing/unknown -> 2 (same NaN-key fix).
dict_F75 = {'M': 1, 'F': 0}
data['F75'] = data['F75'].map(dict_F75).fillna(2).astype(int)
#%% missing-value handling (缺失值处理)
# Drop the id column and the two unencoded text columns, then zero-fill the
# remaining gaps; `drop` returns a new frame, so `data` itself is untouched.
data1 = data.drop(columns=['csr_id', 'F74', 'F80']).fillna(0)
#%%
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

data_X = data1.iloc[:, 1:]   # predictors: every column after the target
data_y = data1.iloc[:, 0]    # target: first column
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.4, train_size=0.6, random_state=12345)
#%%
# Resampling (采样): SMOTE over-sampling combined with Tomek-link cleaning to
# balance the minority (fraud) class before fitting the tree.
from imblearn.combine import SMOTETomek
kos = SMOTETomek(random_state=0)  # combined over/under sampling
# BUG FIX: fit_sample() was removed in imbalanced-learn 0.8; use fit_resample().
X_kos, y_kos = kos.fit_resample(train_X, train_y)
#%%
# Decision-tree model (决策树建模): grid-search a CART tree on the resampled data.
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier(criterion='gini', random_state=1234)
param_grid = {
    'max_depth': [4, 12, 13, 14],
    'max_leaf_nodes': [4, 12, 30, 31, 32],
}
cv = GridSearchCV(clf, param_grid=param_grid, scoring='f1')
cv.fit(X_kos, y_kos)
predict_test = cv.predict(test_X)
test_proba = cv.predict_proba(test_X)[:, 1]
#%%
from sklearn import metrics
# Evaluate on the untouched test split using the predicted fraud probability.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_y, test_proba)
print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))
#print(metrics.classification_report(test_y, predict_test))
#%%
cv.best_params_
#%%
# Neural-network model (神经网络建模): an MLP needs features on a common scale,
# so min-max normalise with parameters learned on the training split only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(train_X)
scaled_train_data = scaler.transform(train_X)
scaled_test_data = scaler.transform(test_X)
# In[5]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10,), activation='logistic',
                    alpha=0.1, max_iter=1000)
mlp.fit(scaled_train_data, train_y)
mlp
#%%
predict_test2 = mlp.predict(scaled_test_data)
test_proba2 = mlp.predict_proba(scaled_test_data)[:, 1]
from sklearn import metrics
# ROC-AUC on the test split from the positive-class probabilities.
fpr_test2, tpr_test2, th_test2 = metrics.roc_curve(test_y, test_proba2)
print('AUC = %6.4f' % metrics.auc(fpr_test2, tpr_test2))
#print(metrics.classification_report(test_y, predict_test2))
#%%
## Random-forest model (RF建模) with grid-searched hyper-parameters.
import sklearn.ensemble as ensemble

param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [4, 5, 6, 7],
    'n_estimators': [24, 26, 28],        # number of trees (RF-specific)
    'max_features': [0.02, 0.04, 0.4],   # fraction of features per tree (RF-specific)
    'min_samples_split': [74, 75, 78],
}
rfc = ensemble.RandomForestClassifier()
rfccv = GridSearchCV(estimator=rfc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
rfccv.fit(train_X, train_y)
test_est3 = rfccv.predict(test_X)
# BUG FIX: roc_curve must be fed scores/probabilities, not hard 0/1 labels
# (labels collapse the curve to a single point and understate the AUC).
test_proba3 = rfccv.predict_proba(test_X)[:, 1]
fpr_test3, tpr_test3, th_test3 = metrics.roc_curve(test_y, test_proba3)
print('random forest ROC')   # label fixed: this is the RF model, not a tree
print('ROC=%.4f' % metrics.auc(fpr_test3, tpr_test3))
#%%
rfccv.best_params_
#%%
# AdaBoost (Adaboost算法): SAMME boosting, tuning only the learning rate.
param_grid = {
    #'base_estimator':['DecisionTreeClassifier'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
}
abc = ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME')
abccv = GridSearchCV(estimator=abc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
abccv.fit(train_X, train_y)
test_est4 = abccv.predict(test_X)
print("abc classifier accuracy:")
print(metrics.classification_report(test_y, test_est4))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba4 = abccv.predict_proba(test_X)[:, 1]
fpr_test4, tpr_test4, th_test4 = metrics.roc_curve(test_y, test_proba4)
print('AdaBoost ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test4, tpr_test4))
#%%
abccv.best_params_
# In[ ]:
# GBDT: gradient-boosted trees, first hyper-parameter sweep.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.8],
    'n_estimators': [26, 28, 30, 32],   # number of trees (GBDT-specific)
    'max_depth': [4, 5, 6],             # max depth per tree (GBDT-specific)
    'min_samples_split': [2, 15, 20, 25],
}
gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
gbccv.fit(train_X, train_y)
test_est5 = gbccv.predict(test_X)
print("gradient boosting accuracy:")
print(metrics.classification_report(test_y, test_est5))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba5 = gbccv.predict_proba(test_X)[:, 1]
fpr_test5, tpr_test5, th_test5 = metrics.roc_curve(test_y, test_proba5)
print('GBDT ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test5, tpr_test5))
#%%
gbccv.best_params_
#%%
# GBDT: second, wider hyper-parameter sweep with a fixed random seed.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.05, 0.1, 0.2, 0.8],
    'n_estimators': [26, 30, 70, 80, 90],   # number of trees (GBDT-specific)
    'max_depth': [3, 4, 5],                 # max depth per tree (GBDT-specific)
    'min_samples_split': [2, 3],
}
gbc = ensemble.GradientBoostingClassifier(random_state=12)
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid,
                     scoring='roc_auc', cv=4, n_jobs=-1)
gbccv.fit(train_X, train_y)
test_est5 = gbccv.predict(test_X)
print("gradient boosting accuracy:")
print(metrics.classification_report(test_y, test_est5))
# BUG FIX: compute the ROC curve from predicted probabilities, not hard labels.
test_proba5 = gbccv.predict_proba(test_X)[:, 1]
fpr_test5, tpr_test5, th_test5 = metrics.roc_curve(test_y, test_proba5)
print('GBDT ROC')   # label fixed: previously said 'decision tree ROC'
print('ROC=%.4f' % metrics.auc(fpr_test5, tpr_test5))
gbccv.best_params_
#%%
# PR curve (PR曲线) for the MLP model.
# BUG FIX: the original plotted TPR against 1-FPR, which is not a
# precision-recall curve at all; precision must come from
# precision_recall_curve. Typo 'precisoin' also fixed.
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

precision2, recall2, _ = precision_recall_curve(test_y, test_proba2)
plt.figure(figsize=[6, 6])
plt.plot(precision2, recall2)
plt.ylabel('recall')
plt.xlabel('precision')
plt.title('pr curve')
plt.show()
# coding: utf-8
# #### 1. Background:
#
# Bank C's credit-card centre, after researching fraud risk and anti-fraud
# techniques, combines internal and external data to build a score-model-based
# fraud identification and prevention system. This assignment builds a
# credit-card application anti-fraud model from the provided data
# ("FRAUD_TRAIN_Samp.csv", cited from "SAS Financial Data Mining and
# Modelling" by Chen Chunbao et al.).
#
# #### 2. Variable descriptions:
#
# None - ensemble algorithms are black-box models, so variable meanings are
# not required.
#
# #### 3. Tasks:
#
# - 3.1 Basics: compare the use cases, strengths and weaknesses of logistic
#   regression, decision trees, neural networks and ensemble algorithms.
# - 3.2 Case steps:
#   1) Build anti-fraud models with a decision tree, a neural network and
#      an ensemble algorithm; compare the three models.
#   2) Self-study: plot a PR curve (x axis: precision, y axis: recall).
# In[34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
get_ipython().run_line_magic('matplotlib', 'inline')
# BUG FIX: pandas.read_excel does not accept an `encoding` keyword (it was
# ignored historically and raises TypeError on modern pandas) - drop it.
sample = pd.read_excel('samp.xlsx')
# In[35]:
## Look at the distribution of the target label.
# In[37]:
sns.distplot(sample.target, kde=True, fit=stats.norm)
# In[38]:
# Split the rows by label value (column index 1 holds the target).
sample_1 = sample.loc[sample.iloc[:, 1] == 1, :]
sample_0 = sample.loc[sample.iloc[:, 1] == 0, :]
# In[39]:
# In[40]:
# Initial data cleaning: preprocess the categorical columns F81, F74, F75.
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']   # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False
sample.F81 = sample.F81.fillna('缺失')
sample.F81.value_counts().plot(kind='bar')
dict81 = {'无等级': 1, '缺失': 2, '平衡': 3, '稳健': 4, '成长': 5, '进取': 6, '保守': 7}
sample.F81 = sample.F81.map(dict81)
sample.F74.value_counts().plot(kind='bar')
dict74 = {'lvyou': 1, 'feiji': 2, 'tielu': 3, 'xchen': 4, 'JIUDIAN': 5, 'quna': 6, 'JYY': 7}
# BUG FIX: the original mapped F81 (already numeric at this point) through
# dict74, wiping F74 with all-NaN. Map F74 through its own dictionary.
sample.F74 = sample.F74.map(dict74)
# In[41]:
sample.F75 = sample.F75.fillna('O')
sample.F75.unique()
# In[42]:
sample.F75.value_counts().plot(kind='bar')
dict75 = {'M': 0, 'F': 1, 'O': 2}
sample.F75 = sample.F75.map(dict75)
# In[43]:
# Fill NaN with 0 in every feature column; `cloindex` was meant to record
# non-numeric column indexes but is never populated (kept for compatibility).
cloindex = []
# Generalised: fill all feature columns (index 2 onward) in one vectorised
# call instead of a hard-coded range(245), so a different column count works.
sample.iloc[:, 2:] = sample.iloc[:, 2:].fillna(0)
# BUG FIX: positional `axis` in DataFrame.drop is deprecated/removed - name it.
sample = sample.drop('F80', axis=1)
# In[44]:
# In[45]:
sample.target.value_counts().plot(kind='bar')
# #### Build the model with a decision tree
# In[46]:
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_data = sample.iloc[:, 2:]   # features
y_data = sample.iloc[:, 1]    # target label
# In[47]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.35, random_state=1
)
# In[48]:
# Fit an unconstrained decision tree as a baseline and report its accuracy.
regr = DecisionTreeClassifier()
regr.fit(X_train, y_train)
print('Training score:%f' % (regr.score(X_train, y_train)))
print('Testing score:%f' % (regr.score(X_test, y_test)))
# In[49]:
# Hard predictions and positive-class probabilities on both splits.
result_train = regr.predict(X_train)
result_train_proba = regr.predict_proba(X_train)[:, 1]
result_test = regr.predict(X_test)
result_test_proba = regr.predict_proba(X_test)[:, 1]
pd.DataFrame({
    'y_test': y_test,
    'result_test': result_test,
    'result_test_proba': result_test_proba,
}).T
# In[61]:
import sklearn.metrics as metrics
print(metrics.confusion_matrix(y_test, result_test, labels=[0, 1]))
print(metrics.classification_report(y_test, result_test))
#print(pd.DataFrame(list(zip(sample.columns,regr.feature_importances_))))
# In[56]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# In[58]:
# Tune tree depth and split size with 4-fold CV, optimising ROC-AUC.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [4, 8, 12, 16, 20, 24, 28],
}
clf = DecisionTreeClassifier(criterion='entropy')
clfcv = GridSearchCV(estimator=clf, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
clfcv.fit(X_train, y_train)
# In[158]:
# A small logistic-activation MLP trained on the raw (unscaled) features.
mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation='logistic',
                    alpha=0.1,
                    max_iter=100000)
mlp.fit(X_train, y_train)
mlp
# In[144]:
# BUG FIX: predict() requires the samples to score; it was called with no
# arguments, which raises TypeError. Score the training split.
train_predict = mlp.predict(X_train)
# In[164]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# Exhaustive search over MLP architecture, activation and regularisation.
param_grid = {
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'hidden_layer_sizes': [(10,), (15,), (20,), (5, 5)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
}
mlp = MLPClassifier()
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=1)
gcv.fit(X_train, y_train)
gcv.best_params_
# In[51]:
import seaborn as sns
# Overlay the predicted-score distributions of the two classes
# (red = fraud, blue = normal).
red, blue = sns.color_palette('Set1', 2)
sns.distplot(result_test_proba[y_test == 1], kde=False, bins=15, color=red)
sns.distplot(result_test_proba[y_test == 0], kde=False, bins=15, color=blue)
plt.show()
# In[59]:
# BUG FIX: roc_curve expects (y_true, y_score). The original passed hard 0/1
# predictions as the score, which degenerates the curve to a single corner
# point; feed the predicted probabilities instead.
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, result_test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, result_train_proba)
plt.figure(figsize=[6, 6])
plt.plot(fpr_test, tpr_test, color='blue')
plt.plot(fpr_train, tpr_train, color='red')
plt.title('ROC curve')
plt.show()
# In[62]:
# BUG FIX: like the ROC curve, the PR curve must be built from probabilities;
# hard class labels yield only a couple of degenerate precision/recall points.
p_test, r_test, _ = metrics.precision_recall_curve(y_test, result_test_proba)
p_train, r_train, _ = metrics.precision_recall_curve(y_train, result_train_proba)
plt.figure(figsize=[6, 6])
plt.plot(r_test, p_test, color='blue')
plt.plot(r_train, p_train, color='red')
plt.title('P-R 曲线')
plt.show()
# #### 1. Background:
#
# Bank C's credit-card centre, after researching fraud risk and anti-fraud
# techniques, combines internal and external data to build a score-model-based
# fraud identification and prevention system. This assignment builds a
# credit-card application anti-fraud model from the provided data
# ("FRAUD_TRAIN_Samp.csv", cited from "SAS Financial Data Mining and
# Modelling" by Chen Chunbao et al.).
#
# #### 2. Variable descriptions:
#
# None - ensemble algorithms are black-box models, so variable meanings are
# not required.
#
# #### 3. Tasks:
#
# - 3.1 Basics: compare the use cases, strengths and weaknesses of logistic
#   regression, decision trees, neural networks and ensemble algorithms.
# - 3.2 Case steps:
#   1) Build anti-fraud models with a decision tree, a neural network and
#      an ensemble algorithm; compare the three models.
#   2) Self-study: plot a PR curve (x axis: precision, y axis: recall).
# In[34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
get_ipython().run_line_magic('matplotlib', 'inline')
# BUG FIX: pandas.read_excel does not accept an `encoding` keyword (it was
# ignored historically and raises TypeError on modern pandas) - drop it.
sample = pd.read_excel('samp.xlsx')
# In[35]:
## Look at the distribution of the target label.
# In[37]:
sns.distplot(sample.target, kde=True, fit=stats.norm)
# In[38]:
# Split the rows by label value (column index 1 holds the target).
sample_1 = sample.loc[sample.iloc[:, 1] == 1, :]
sample_0 = sample.loc[sample.iloc[:, 1] == 0, :]
# In[39]:
# In[40]:
# Initial data cleaning: preprocess the categorical columns F81, F74, F75.
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']   # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False
sample.F81 = sample.F81.fillna('缺失')
sample.F81.value_counts().plot(kind='bar')
dict81 = {'无等级': 1, '缺失': 2, '平衡': 3, '稳健': 4, '成长': 5, '进取': 6, '保守': 7}
sample.F81 = sample.F81.map(dict81)
sample.F74.value_counts().plot(kind='bar')
dict74 = {'lvyou': 1, 'feiji': 2, 'tielu': 3, 'xchen': 4, 'JIUDIAN': 5, 'quna': 6, 'JYY': 7}
# BUG FIX: the original mapped F81 (already numeric at this point) through
# dict74, wiping F74 with all-NaN. Map F74 through its own dictionary.
sample.F74 = sample.F74.map(dict74)
# In[41]:
sample.F75 = sample.F75.fillna('O')
sample.F75.unique()
# In[42]:
sample.F75.value_counts().plot(kind='bar')
dict75 = {'M': 0, 'F': 1, 'O': 2}
sample.F75 = sample.F75.map(dict75)
# In[43]:
# Fill NaN with 0 in every feature column; `cloindex` was meant to record
# non-numeric column indexes but is never populated (kept for compatibility).
cloindex = []
# Generalised: fill all feature columns (index 2 onward) in one vectorised
# call instead of a hard-coded range(245), so a different column count works.
sample.iloc[:, 2:] = sample.iloc[:, 2:].fillna(0)
# BUG FIX: positional `axis` in DataFrame.drop is deprecated/removed - name it.
sample = sample.drop('F80', axis=1)
# In[44]:
# In[45]:
sample.target.value_counts().plot(kind='bar')
# #### Build the model with a decision tree
# In[46]:
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_data = sample.iloc[:, 2:]   # features
y_data = sample.iloc[:, 1]    # target label
# In[47]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.35, random_state=1
)
# In[48]:
# Fit an unconstrained decision tree as a baseline and report its accuracy.
regr = DecisionTreeClassifier()
regr.fit(X_train, y_train)
print('Training score:%f' % (regr.score(X_train, y_train)))
print('Testing score:%f' % (regr.score(X_test, y_test)))
# In[49]:
# Hard predictions and positive-class probabilities on both splits.
result_train = regr.predict(X_train)
result_train_proba = regr.predict_proba(X_train)[:, 1]
result_test = regr.predict(X_test)
result_test_proba = regr.predict_proba(X_test)[:, 1]
pd.DataFrame({
    'y_test': y_test,
    'result_test': result_test,
    'result_test_proba': result_test_proba,
}).T
# In[61]:
import sklearn.metrics as metrics
print(metrics.confusion_matrix(y_test, result_test, labels=[0, 1]))
print(metrics.classification_report(y_test, result_test))
#print(pd.DataFrame(list(zip(sample.columns,regr.feature_importances_))))
# In[56]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# In[58]:
# Tune tree depth and split size with 4-fold CV, optimising ROC-AUC.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [4, 8, 12, 16, 20, 24, 28],
}
clf = DecisionTreeClassifier(criterion='entropy')
clfcv = GridSearchCV(estimator=clf, param_grid=param_grid,
                     scoring='roc_auc', cv=4)
clfcv.fit(X_train, y_train)
# In[158]:
# A small logistic-activation MLP trained on the raw (unscaled) features.
mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation='logistic',
                    alpha=0.1,
                    max_iter=100000)
mlp.fit(X_train, y_train)
mlp
# In[144]:
# BUG FIX: predict() requires the samples to score; it was called with no
# arguments, which raises TypeError. Score the training split.
train_predict = mlp.predict(X_train)
# In[164]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# Exhaustive search over MLP architecture, activation and regularisation.
param_grid = {
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'hidden_layer_sizes': [(10,), (15,), (20,), (5, 5)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
}
mlp = MLPClassifier()
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=1)
gcv.fit(X_train, y_train)
gcv.best_params_
# In[51]:
import seaborn as sns
# Overlay the predicted-score distributions of the two classes
# (red = fraud, blue = normal).
red, blue = sns.color_palette('Set1', 2)
sns.distplot(result_test_proba[y_test == 1], kde=False, bins=15, color=red)
sns.distplot(result_test_proba[y_test == 0], kde=False, bins=15, color=blue)
plt.show()
# In[59]:
# BUG FIX: roc_curve expects (y_true, y_score). The original passed hard 0/1
# predictions as the score, which degenerates the curve to a single corner
# point; feed the predicted probabilities instead.
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, result_test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, result_train_proba)
plt.figure(figsize=[6, 6])
plt.plot(fpr_test, tpr_test, color='blue')
plt.plot(fpr_train, tpr_train, color='red')
plt.title('ROC curve')
plt.show()
# In[62]:
# BUG FIX: like the ROC curve, the PR curve must be built from probabilities;
# hard class labels yield only a couple of degenerate precision/recall points.
p_test, r_test, _ = metrics.precision_recall_curve(y_test, result_test_proba)
p_train, r_train, _ = metrics.precision_recall_curve(y_train, result_train_proba)
plt.figure(figsize=[6, 6])
plt.plot(r_test, p_test, color='blue')
plt.plot(r_train, p_train, color='red')
plt.title('P-R 曲线')
plt.show()