1、背景介绍:
C银行信用卡中心在对欺诈风险和反欺诈技术作了充分研究之后,融合内外部数据,建立以评分模型为支撑的欺诈识别和防范系统,以满足精细化管理的需要。本次作业根据提供的数据(“FRAUD_TRAIN_Samp.csv”,引用自陈春宝等出版的《SAS金融数据挖掘与建模》)建立信用卡申请反欺诈模型。
2、主要变量说明如下:
#无-组合算法为黑箱模型,无需知道变量含义
3、作业安排:
3.1 基础知识:
1)比较逻辑回归、决策树、神经网络、组合算法的适用场景和优缺点。
3.2 案例解答步骤如下:
1)使用决策树、神经网络、组合算法建立反欺诈模型,比较三个模型的表现。
2)自学绘制PR曲线:横轴为精确度(Precise),纵轴为召回率(Recall)。
In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
%matplotlib inline
sample=pd.read_excel('samp.xlsx',encoding='utf-8')
In [35]:
sample.head()
Out[35]:
csr_idtargetF2F3F4F5F6F7F8F9...F237F238F239F240F241F242F243F244F245F246
085350100601100...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
173450000431118012...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
28437030090801...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3616601600502310016...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4937001400540004...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5 rows × 247 columns
In [36]:
sample.describe()
Out[36]:
csr_idtargetF2F3F4F5F6F7F8F9...F237F238F239F240F241F242F243F244F245F246
count972.000000972.000000972.000000972.000000972.000000972.000000972.000000972.000000972.000000972.000000...7.07.07.0000007.0000007.07.07.07.0000007.0000007.000000
mean5109.4650210.05041210.0668720.1460910.85802527.9907419.3014409.2407414.5257207.253086...0.00.071642.85714371642.8571430.00.00.083282.81428683711.38571412068.528571
std2894.0404750.21890520.5666551.33681713.72603830.70907518.49231420.66384518.31682412.743680...0.00.0145591.527422145591.5274220.00.00.0188748.601642188531.264044264389.597515
min20.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.00.00.0000000.0000000.00.00.00.0000000.000000-400000.000000
25%2530.5000000.0000001.7500000.0000000.0000008.0000000.0000001.0000000.0000000.000000...0.00.03500.0000003500.0000000.00.00.00.0000001500.000000-13209.535000
50%5296.0000000.0000007.0000000.0000000.00000020.0000003.0000004.0000000.0000003.000000...0.00.025000.00000025000.0000000.00.00.012780.93000012780.930000-7000.000000
75%7628.2500000.00000012.0000000.0000000.00000039.00000011.00000012.0000003.00000010.000000...0.00.034750.00000034750.0000000.00.00.030150.00000030150.0000004000.000000
max9996.0000001.000000483.00000030.000000427.000000327.000000267.000000538.000000457.000000262.000000...0.00.0400000.000000400000.0000000.00.00.0509898.770000509898.770000509898.770000
8 rows × 243 columns
In [37]:
sns.distplot(sample.target,kde=True,fit=stats.norm)
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d3bdfebe0>
In [38]:
sample_1=sample.loc[sample.iloc[:,1]==1,:]
sample_0=sample.loc[sample.iloc[:,1]==0,:]
In [39]:
sample_1.describe()
Out[39]:
csr_idtargetF2F3F4F5F6F7F8F9...F237F238F239F240F241F242F243F244F245F246
count49.00000049.049.00000049.00000049.00000049.00000049.00000049.00000049.00000049.000000...1.01.01.01.01.01.01.01.01.01.0
mean251.8571431.08.0612240.2040820.71428631.30612210.1632659.3469392.6530617.714286...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
std129.9474250.07.5482010.7065051.48604623.44426418.25210810.5347665.3211499.438397...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
min20.0000001.00.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
259.0000001.02.0000000.0000000.00000014.0000000.0000001.0000000.0000000.000000...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
50%233.0000001.07.0000000.0000000.00000027.0000004.0000007.0000000.0000004.000000...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
75%372.0000001.012.0000000.0000000.00000043.00000012.00000013.0000002.00000013.000000...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
max485.0000001.038.0000003.0000007.000000109.000000113.00000047.00000027.00000031.000000...0.00.0400000.0400000.00.00.00.00.00.0-400000.0
8 rows × 243 columns
In [40]:
from pylab import mpl
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False
sample.F81=sample.F81.fillna('缺失')
sample.F81.value_counts().plot(kind='bar')
dict81={'无等级':1,'缺失':2,'平衡':3,'稳健':4,'成长':5,'进取':6,'保守':7}
sample.F81=sample.F81.map(dict81)
sample.F74.value_counts().plot(kind='bar')
dict74={'lvyou':1,'feiji':2,'tielu':3,'xchen':4,'JIUDIAN':5,'quna':6,'JYY':7}
sample.F74=sample.F81.map(dict74)
In [41]:
sample.F75=sample.F75.fillna('O')
sample.F75.unique()
Out[41]:
array(['M', 'F', 'O'], dtype=object)
In [42]:
sample.F75.value_counts().plot(kind='bar')
dict75={'M':0,'F':1,'O':2}
sample.F75=sample.F75.map(dict75)
In [43]:
cloindex=[]
for i in range(245):
sample.iloc[:,i+2]=sample.iloc[:,i+2].fillna(0)
sample=sample.drop('F80',1)
In [44]:
sample.head()
Out[44]:
csr_idtargetF2F3F4F5F6F7F8F9...F237F238F239F240F241F242F243F244F245F246
085350100601100...0.00.00.00.00.00.00.00.00.00.0
173450000431118012...0.00.00.00.00.00.00.00.00.00.0
28437030090801...0.00.00.00.00.00.00.00.00.00.0
3616601600502310016...0.00.00.00.00.00.00.00.00.00.0
4937001400540004...0.00.00.00.00.00.00.00.00.00.0
5 rows × 246 columns
In [45]:
sample.target.value_counts().plot(kind='bar')
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d3c00d748>
使用决策树构建模型
In [46]:
from sklearn import cross_validation
X_data=sample.iloc[:,2:]
y_data=sample.iloc[:,1]
E:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
In [47]:
X_train,X_test,y_train,y_test=cross_validation.train_test_split(
X_data,y_data,test_size=0.35,random_state=1
)
In [48]:
regr=DecisionTreeClassifier()
regr.fit(X_train,y_train)
print('Training score:%f'%(regr.score(X_train,y_train)))
print('Testing score:%f'%(regr.score(X_test,y_test)))
Training score:1.000000
Testing score:0.891496
In [49]:
result_train=regr.predict(X_train)
result_train_proba=regr.predict_proba(X_train)[:,1]
result_test=regr.predict(X_test)
result_test_proba=regr.predict_proba(X_test)[:,1]
pd.DataFrame({
'y_test':y_test,
'result_test':result_test,
'result_test_proba':result_test_proba
}).T
Out[49]:
51756020569824194431132934133...921149527695435513257355458
y_test0.00.00.00.00.00.01.01.00.00.0...0.00.00.01.01.00.00.00.00.00.0
result_test0.00.00.00.00.01.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
result_test_proba0.00.00.00.00.01.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
3 rows × 341 columns
In [61]:
import sklearn.metrics as metrics
print(metrics.confusion_matrix(y_test,result_test,labels=[0,1]))
print(metrics.classification_report(y_test,result_test))
[[301 22]
[ 15 3]]
precision recall f1-score support
0 0.95 0.93 0.94 323
1 0.12 0.17 0.14 18
avg / total 0.91 0.89 0.90 341
In [51]:
import seaborn as sns
red,blue=sns.color_palette('Set1',2)
sns.distplot(result_test_proba[y_test==1],kde=False,bins=15,color=red)
sns.distplot(result_test_proba[y_test==0],kde=False,bins=15,color=blue)
plt.show()
In [59]:
fpr_test,tpr_test,th_test=metrics.roc_curve(y_test,result_test)
fpr_train,tpr_train,th_train=metrics.roc_curve(y_train,result_train)
plt.figure(figsize=[6,6])
plt.plot(fpr_test,tpr_test,color='blue')
plt.plot(fpr_train,tpr_train,color='red')
plt.title('ROC curve')
plt.show()
In [62]:
p_test,r_test,_=metrics.precision_recall_curve(y_test,result_test)
p_train,r_train,_=metrics.precision_recall_curve(y_train,result_train)
plt.figure(figsize=[6,6])
plt.plot(r_test,p_test,color='blue')
plt.plot(r_train,p_train,color='red')
plt.title('P-R 曲线')
plt.show()
In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
In [58]:
param_grid={
'max_depth':[2,3,4,5,6,7,8],
'min_samples_split':[4,8,12,16,20,24,28]
}
clf=DecisionTreeClassifier(criterion='entropy')
clfcv=GridSearchCV(estimator=clf,param_grid=param_grid,
scoring='roc_auc',cv=4)
clfcv.fit(X_train,y_train)
Out[58]:
GridSearchCV(cv=4, error_score='raise',
estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best'),
fit_params=None, iid=True, n_jobs=1,
param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8], 'min_samples_split': [4, 8, 12, 16, 20, 24, 28]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=0)
In [ ]:
In [ ]:
In [158]:
mlp=MLPClassifier(hidden_layer_sizes=(10,),
activation='logistic',
alpha=0.1,
max_iter=100000)
mlp.fit(X_train,y_train)
mlp
Out[158]:
MLPClassifier(activation='logistic', alpha=0.1, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(10,), learning_rate='constant',
learning_rate_init=0.001, max_iter=100000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=None,
shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
verbose=False, warm_start=False)
In [144]:
train_predict=mlp.predict()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-144-5802580b8a93> in <module>()
----> 1 type(sample.iloc[5,2])==flaot
NameError: name 'flaot' is not defined
In [164]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
param_grid={
'max_iter':[1000,2000,3000,4000,5000],
'hidden_layer_sizes':[(10,),(15,),(20,),(5,5)],
'activation':['identity','logistic','tanh','relu'],
'alpha':[0.001,0.01,0.1,0.2,0.4,1,10]
}
mlp=MLPClassifier()
gcv=GridSearchCV(estimator=mlp,param_grid=param_grid,
scoring='roc_auc',cv=4,n_jobs=1)
gcv.fit(X_train,y_train)
Out[164]:
GridSearchCV(cv=4, error_score='raise',
estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=None,
shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
verbose=False, warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid={'max_iter': [1000, 2000, 3000, 4000, 5000], 'hidden_layer_sizes': [(10,), (15,), (20,), (5, 5)], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'alpha': [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=0)