# 本次作业主要增加了对于变量的处理和分析,但是实际模型效果还是不太理想。
# 应该是某个环节还存在问题,需要进一步跟大家沟通交流学习。
# (Translation: this assignment mainly adds variable processing/analysis, but the
# model still underperforms — likely a remaining issue in some step; to discuss.)
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV, train_test_split
# In[2]:
# Load the labeled fraud training sample from Excel.
fraud_data = pd.read_excel(r'FRAUD_TRAIN_Samp.xlsx')
fraud_data.head()
# In[3]:
# Split into label vector and raw feature frame; the first two columns
# (id-like / target) are not model features.
y = fraud_data.loc[:, 'target']
X = fraud_data.iloc[:, 2:]
# In[4]:
X.shape
# In[5]:
# Per-column missing-value ratio, sorted highest first.
missing = (X.isnull().sum() / len(X)).sort_values(ascending=False)
missing
# In[6]:
# Drop variables whose missing ratio is strictly above 0.9, and forget
# their entries in the ratio series as well.
too_sparse = missing[missing > 0.9].index
X_nomissing = X.drop(too_sparse, axis=1)
missing = missing.drop(too_sparse)
X_nomissing.shape
# In[7]:
# Impute variables with missing ratio < 0.7: the three known categorical
# columns get their mode, everything else its mean.
# FIX: the original used X_nomissing[idx].fillna(..., inplace=True); an
# in-place fillna on a column selection mutates a possibly-temporary object
# (SettingWithCopy hazard, a no-op under pandas copy-on-write). Assign back.
# NOTE(review): columns with ratio exactly in [0.7, 0.9] are handled by the
# next cell's `missing > 0.7` loop, except ratio == 0.7 which neither loop
# touches — confirm whether that boundary is intentional.
for idx in missing[missing < 0.7].index:
    if idx in ('F75', 'F81', 'F74'):
        fill_value = X_nomissing[idx].mode()[0]
    else:
        fill_value = X_nomissing[idx].mean()
    X_nomissing[idx] = X_nomissing[idx].fillna(fill_value)
# In[8]:
# Variables with missing ratio > 0.7: add a 0/1 "was missing" indicator
# column, then mean-impute the original column.
# FIX: the original flag used .map(bool).map(int), but bool(NaN) is True and
# bool(0) is False, so it encoded truthiness of the value, not missingness.
# isnull() gives the intended is-NA indicator (1 = was missing).
# Also assign fillna back instead of chained inplace (SettingWithCopy hazard).
for idx in missing[missing > 0.7].index:
    flag_col = idx + 'is_na'
    X_nomissing[flag_col] = X_nomissing[idx].isnull().astype(int)
    X_nomissing[idx] = X_nomissing[idx].fillna(X_nomissing[idx].mean())
# In[9]:
# Build a category -> frequency-rank encoding for the three categorical
# columns: the most frequent level maps to 1, the next to 2, and so on.
# (Equivalent to the original value_counts / rename / drop / to_dict dance.)
map_dict = {}
for col in ('F75', 'F81', 'F74'):
    ranked_levels = X[col].value_counts().index
    tmp_dict = {col: {level: rank
                      for rank, level in enumerate(ranked_levels, start=1)}}
    map_dict.update(tmp_dict)
    print(tmp_dict)
map_dict
# In[10]:
# Apply the frequency-rank encoding to the categorical columns in place.
X_nomissing.replace(map_dict, inplace=True)
# In[11]:
# Sanity check: no column should still contain missing values.
missing2 = X_nomissing.isnull().mean().sort_values(ascending=False)
missing2
# In[12]:
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_nomissing, y, train_size=0.7, random_state=123)
# # 决策树
# In[13]:
import sklearn.tree as tree

# Tune a decision tree by exhaustive grid search, scored on ROC-AUC with
# 4-fold cross-validation.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [4, 5, 6, 7, 8, 9],
    'min_samples_split': [6, 8, 10, 14, 18, 20, 22, 24],
}
clf1 = tree.DecisionTreeClassifier()
clf1cv = GridSearchCV(clf1, param_grid, scoring='roc_auc', cv=4)
clf1cv.fit(X_train, y_train)
# In[14]:
# Evaluate the tuned tree on the held-out test set.
test_pre = clf1cv.predict(X_test)
# In[15]:
print("decision tree accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("decision tree AUC:")
# FIX: ROC/AUC must be computed from continuous scores, not hard 0/1 labels;
# predict() collapses the ranking and badly understates the AUC.
test_proba = clf1cv.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
# In[16]:
clf1cv.best_params_
# # ANN
# In[17]:
from sklearn.preprocessing import MinMaxScaler

# Scale features to [0, 1]. Fit on the training split only and reuse the
# fitted scaler for the test split so no test information leaks into scaling.
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)
# In[19]:
from sklearn.neural_network import MLPClassifier

# Tune a small MLP on the scaled features (ROC-AUC, 4-fold CV, all cores).
param_grid = {
    'hidden_layer_sizes': [(5,), (10,), (15,), (20,), (5, 5)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 10],
}
mlp = MLPClassifier(max_iter=1000)
gcv = GridSearchCV(mlp, param_grid, scoring='roc_auc', cv=4, n_jobs=-1)
gcv.fit(X_train_sc, y_train)
# In[20]:
# Evaluate the tuned MLP on the held-out (scaled) test set.
# BUG FIX: the original called clf1cv.predict(X_test_sc) — that is the
# decision-tree search object; the fitted ANN search object is gcv.
test_pre = gcv.predict(X_test_sc)
# In[22]:
print("ANN accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("ANN AUC:")
# FIX: compute the ROC curve from class-1 probabilities, not hard labels.
test_proba = gcv.predict_proba(X_test_sc)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
# In[23]:
gcv.best_params_
# # GBDT
# In[25]:
import sklearn.ensemble as ensemble
# In[26]:
# Tune a gradient-boosted tree ensemble (ROC-AUC, 4-fold CV).
# NOTE(review): loss='deviance' was renamed 'log_loss' in newer scikit-learn;
# kept as-is to match the (old) sklearn version this file targets — confirm.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
    'n_estimators': [10, 20, 30, 40],
    'max_depth': [2, 4, 6, 8, 10, 12, 15],
    'min_samples_split': [8, 14, 20, 22],
}
gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(gbc, param_grid, scoring='roc_auc', cv=4)
gbccv.fit(X_train, y_train)
# In[28]:
# Evaluate the tuned GBDT on the held-out test set.
test_pre = gbccv.predict(X_test)
# In[29]:
print("gradient boosting accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("gradient boosting AUC:")
# FIX: compute ROC/AUC from predicted probabilities rather than hard labels.
test_proba = gbccv.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
# In[30]:
gbccv.best_params_
# In[ ]: