八大直播作业-第七讲-Aringrhus

浏览: 1335

本次作业主要增加了对于变量的处理和分析,但是实际模型效果还是不太理想。

应该是某个环节还存在问题,需要进一步跟大家沟通交流学习。


# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases only
    from sklearn.cross_validation import train_test_split


# In[2]:


# Load the sampled fraud training workbook.
fraud_data = pd.read_excel(r'FRAUD_TRAIN_Samp.xlsx')
fraud_data.head()


# In[3]:


# Features are every column from the third one onward; the label is the
# 'target' column.
X = fraud_data.iloc[:, 2:]
y = fraud_data['target']


# In[4]:


X.shape


# In[5]:


# Share of missing values per feature, largest first.
missing = X.isnull().mean().sort_values(ascending=False)
missing


# In[6]:


# Drop variables whose missing rate is above 0.9, and remove those same
# variables from the missing-rate table so later cells skip them.
too_sparse = missing.index[missing > 0.9]
X_nomissing = X.drop(too_sparse, axis=1)
missing = missing.drop(too_sparse)
X_nomissing.shape


# In[7]:


# Variables with a missing rate of at most 0.7 are imputed directly:
# F75 / F81 / F74 are filled with their mode, every other column with its
# mean. The boundary value 0.7 is included here so that no column falls
# between this loop and the next one.
for idx in missing[missing <= 0.7].index:
    if idx in ('F75', 'F81', 'F74'):
        vm = X_nomissing[idx].mode()[0]
    else:
        vm = X_nomissing[idx].mean()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which may operate on a temporary copy.
    X_nomissing[idx] = X_nomissing[idx].fillna(vm)


# In[8]:


# Variables with a missing rate above 0.7 get an "is missing" indicator
# column and are then filled with their mean.
for idx in missing[missing > 0.7].index:
    new_col = idx + 'is_na'
    # Build the indicator before imputing. isnull() marks exactly the missing
    # cells; map(bool) would not, because bool(NaN) is True and every
    # non-zero value would be flagged as well.
    X_nomissing[new_col] = X_nomissing[idx].isnull().astype(int)
    vmean = X_nomissing[idx].mean()
    X_nomissing[idx] = X_nomissing[idx].fillna(vmean)


# In[9]:


# Frequency-rank encoding for F75 / F81 / F74: within each column the most
# frequent level (counted on the original X) maps to 1, the next to 2, and
# so on. map_dict ends up as {column: {level: rank}}.
map_dict = {}
for col in ('F75', 'F81', 'F74'):
    levels_by_freq = X[col].value_counts().index
    col_codes = {
        col: {level: rank for rank, level in enumerate(levels_by_freq, start=1)}
    }
    map_dict.update(col_codes)
    print(col_codes)

map_dict


# In[10]:


# Apply the per-column level -> rank substitution to the working frame.
X_nomissing.replace(map_dict, inplace=True)


# In[11]:


# Re-compute the per-column missing share on the processed frame, largest
# first, to inspect what is left after imputation.
missing2 = X_nomissing.isnull().mean().sort_values(ascending=False)
missing2


# In[12]:


# Hold out part of the data for testing: 70% goes to training, with a fixed
# seed so the split is repeatable.
split_parts = train_test_split(
    X_nomissing, y, train_size=0.7, random_state=123
)
X_train, X_test, y_train, y_test = split_parts


# # Decision tree

# In[13]:


import sklearn.tree as tree

# Grid of tree hyper-parameters searched with 4-fold CV, ranked by ROC AUC.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [4, 5, 6, 7, 8, 9],
    'min_samples_split': [6, 8, 10, 14, 18, 20, 22, 24],
}
clf1 = tree.DecisionTreeClassifier()
clf1cv = GridSearchCV(estimator=clf1, param_grid=param_grid, scoring='roc_auc', cv=4)
clf1cv.fit(X_train, y_train)


# In[14]:


# Hard labels are used for the classification report only.
test_pre = clf1cv.predict(X_test)
# The ROC curve needs a continuous score; thresholded labels collapse it to a
# single operating point. Column 1 is the second entry of classes_
# (assumes a binary target whose positive class sorts last -- confirm).
test_proba = clf1cv.predict_proba(X_test)[:, 1]


# In[15]:


print("decision tree accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("decision tree AUC:")
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))


# In[16]:


clf1cv.best_params_


# # ANN

# In[17]:


from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same transform
# to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)


# In[19]:


from sklearn.neural_network import MLPClassifier

# Grid of MLP hyper-parameters searched with 4-fold CV, ranked by ROC AUC.
param_grid = {
    'hidden_layer_sizes': [(5, ), (10, ), (15, ), (20, ), (5, 5)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 10],
}
mlp = MLPClassifier(max_iter=1000)
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=-1)
gcv.fit(X_train_sc, y_train)


# In[20]:


# Evaluate the fitted MLP search (gcv) on the scaled test data. The earlier
# version called the decision-tree search here, so the "ANN" metrics were
# really tree predictions on rescaled inputs.
test_pre = gcv.predict(X_test_sc)
# Continuous score for the ROC curve; column 1 is the second entry of
# classes_ (assumes a binary target whose positive class sorts last -- confirm).
test_proba = gcv.predict_proba(X_test_sc)[:, 1]


# In[22]:


print("ANN accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("ANN AUC:")
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))


# In[23]:


gcv.best_params_


# # GBDT

# In[25]:


import sklearn.ensemble as ensemble


# In[26]:


# Grid of gradient-boosting hyper-parameters searched with 4-fold CV, ranked
# by ROC AUC.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to
# 'log_loss' -- confirm the name against the installed version.
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
    'n_estimators': [10, 20, 30, 40],
    'max_depth': [2, 4, 6, 8, 10, 12, 15],
    'min_samples_split': [8, 14, 20, 22],
}

gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid, scoring='roc_auc', cv=4)
gbccv.fit(X_train, y_train)


# In[28]:


# Hard labels are used for the classification report only.
test_pre = gbccv.predict(X_test)
# The ROC curve needs a continuous score rather than thresholded labels.
# Column 1 is the second entry of classes_ (assumes a binary target whose
# positive class sorts last -- confirm).
test_proba = gbccv.predict_proba(X_test)[:, 1]


# In[29]:


print("gradient boosting accuracy:")
print(metrics.classification_report(y_test, test_pre))
print("gradient boosting AUC:")
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))


# In[30]:


gbccv.best_params_


# In[ ]:





推荐 1
本文由 顺子的 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册