八大直播作业-第六讲-Aringrhus

浏览: 1043

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Work from the homework directory so the relative CSV path used below resolves.
os.chdir(r"D:\learningFiels\天善常课\061第6讲\提交-第六讲:使用决策树和神经网络预测客户流失倾向\作业")
# Render matplotlib figures inline in the notebook. ``run_line_magic`` is the
# supported replacement for the deprecated ``InteractiveShell.magic`` helper.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


# Read the dataset from the working directory and preview its first rows.
marrige = pd.read_csv(filepath_or_buffer='date_data2.csv')
marrige.head(n=5)


# income-月均收入(元)
#
# attractive-由婚恋网站评定出的个人魅力值,分值从0-100。
#
# assets-资产(万元)
#
# edueduclass-教育等级:1=小学,2=初中,3=高中,4=本科,5=硕士及以上
#
# Dated-是否相亲成功:1代表成功

# # 数据探索

# In[23]:


# Print the column overview, then show the summary statistics.
print(marrige.info())
marrige.describe()


# In[16]:


# One box plot per feature, grouped by the Dated outcome.
for feature in ('income', 'attractive', 'assets'):
    sns.boxplot(x='Dated', y=feature, data=marrige)


# In[22]:


# Mean of Dated within each education level, drawn as a bar chart.
dated_mean_by_edu = marrige.groupby('edueduclass')['Dated'].mean()
dated_mean_by_edu.plot(kind='bar')


# # 初版决策树

# In[25]:


import sklearn.model_selection as cross_validation

# Label column, and predictors taken as every column except the last one
# (assumes Dated is the last column of the file -- TODO confirm).
target = marrige['Dated']
data = marrige.iloc[:, :-1]

data.head()


# In[26]:


# 70% of the rows go to the training set; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    data,
    target,
    train_size=0.7,
    random_state=123,
)


# In[31]:


import sklearn.tree as tree


# In[65]:


# Entropy-criterion tree with explicit depth / split / leaf size limits.
tree_settings = dict(
    criterion='entropy',
    max_depth=8,
    min_samples_split=3,
    min_samples_leaf=2,
)
clf = tree.DecisionTreeClassifier(**tree_settings)
clf.fit(X_train, y_train)


# In[66]:


# Predicted labels and column 1 of predict_proba for the training rows
# (presumably the Dated == 1 class -- verify against clf.classes_).
train_pre, train_pre_p = clf.predict(X_train), clf.predict_proba(X_train)[:, 1]


# In[67]:


# Same two outputs for the held-out rows.
test_pre, test_pre_p = clf.predict(X_test), clf.predict_proba(X_test)[:, 1]


# In[68]:


# Side-by-side view of the true label, predicted label and predicted score.
pd.DataFrame(dict(test_target=y_test, test_est=test_pre, test_est_p=test_pre_p))


# In[69]:


import sklearn.metrics as metrics


# In[70]:


# Confusion matrix with the label order fixed to [0, 1].
test_confusion = metrics.confusion_matrix(y_test, test_pre, labels=[0, 1])
print(test_confusion)


# In[71]:


# Sum of the true test labels.
y_test.sum()


# In[72]:


# Sum of the predicted test labels.
test_pre.sum()


# In[73]:


test_report = metrics.classification_report(y_test, test_pre)
print(test_report)


# In[74]:


# Pair every feature name with its importance in the fitted tree.
importance_pairs = list(zip(data.columns, clf.feature_importances_))
print(pd.DataFrame(importance_pairs))


# In[75]:


# ROC curves must be built from continuous scores. The earlier cells already
# computed the column-1 probabilities (test_pre_p / train_pre_p); passing the
# hard 0/1 predictions instead collapses each curve to a single operating
# point and distorts the AUC reported below.
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_pre_p)


# In[84]:


print(fpr_test, tpr_test, th_test)


# In[77]:


fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, train_pre_p)


# In[86]:


print(fpr_train, tpr_train, th_train)


# In[78]:


# Two colours from the "Set1" palette, used to tell the curves apart.
red, blue = sns.color_palette("Set1", 2)


# In[79]:


# Test curve in blue, training curve in red.
plt.figure(figsize=[6, 6])
plt.plot(fpr_test, tpr_test, color=blue)
plt.plot(fpr_train, tpr_train, color=red)
plt.title('ROC curve')
plt.show()


# In[83]:


# Area under the test ROC curve.
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))


# # 初版神经网络

# In[4]:


import sklearn.model_selection as cross_validation

# Label column, and predictors taken as every column except the last one.
target = marrige['Dated']
data = marrige.iloc[:, :-1]

data.head()


# In[5]:


# Same 70% training split and seed as the decision-tree section.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    data,
    target,
    train_size=0.7,
    random_state=123,
)


# In[8]:


from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both partitions.
scaler = MinMaxScaler().fit(X_train)

scaled_train_data, scaled_test_data = (
    scaler.transform(part) for part in (X_train, X_test)
)


# In[10]:


from sklearn.neural_network import MLPClassifier

# One hidden layer of 10 units, logistic activation, alpha=0.1 and at most
# 1000 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(10,),
    activation='logistic',
    alpha=0.1,
    max_iter=1000,
)

mlp.fit(scaled_train_data, y_train)
mlp


# In[11]:


# Predicted labels for both partitions.
train_predict, test_predict = (
    mlp.predict(part) for part in (scaled_train_data, scaled_test_data)
)


# In[12]:


# Compute probabilities: column 1 of predict_proba for each partition.
train_proba, test_proba = (
    mlp.predict_proba(part)[:, 1] for part in (scaled_train_data, scaled_test_data)
)


# In[13]:


from sklearn import metrics

mlp_confusion = metrics.confusion_matrix(y_test, test_predict, labels=[0, 1])
print(mlp_confusion)
mlp_report = metrics.classification_report(y_test, test_predict)
print(mlp_report)


# In[15]:


# Score of the network on the scaled test rows.
mlp.score(scaled_test_data, y_test)


# In[16]:


# ROC points computed from the predicted probabilities of each partition.
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, train_proba)

# Draw the test curve first, then the training curve, on one figure.
plt.figure(figsize=[4, 4])
for fpr, tpr in ((fpr_test, tpr_test), (fpr_train, tpr_train)):
    plt.plot(fpr, tpr, '')
plt.title('ROC curve')
plt.show()

# Area under the test ROC curve.
test_auc = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % test_auc)


# # 网格搜索寻参

# In[17]:



import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import sklearn.model_selection as cross_validation
# 60/40 train/test split with a fixed seed.
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    data,
    target,
    test_size=0.4,
    train_size=0.6,
    random_state=12345,
)



# In[18]:


# Candidate hyper-parameters for the decision tree.
param_grid = dict(
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 4, 6, 8, 10],
)
clf = tree.DecisionTreeClassifier()
# Search the whole grid with 4-fold cross-validation, scored by ROC AUC.
clfcv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
)
clfcv.fit(train_data, train_target)

clfcv.best_params_



# In[19]:


# Build the final tree from the hyper-parameters the grid search selected,
# rather than from values copied by hand, so the model always matches the
# search result even when the data, grid or CV folds change.
clf = tree.DecisionTreeClassifier(**clfcv.best_params_)
clf.fit(train_data, train_target)


# Predicted labels and column-1 probabilities for both partitions.
train_est = clf.predict(train_data)
train_est_p = clf.predict_proba(train_data)[:, 1]
test_est = clf.predict(test_data)
test_est_p = clf.predict_proba(test_data)[:, 1]
# Side-by-side view of the true label, predicted label and predicted score.
pd.DataFrame({'test_target': test_target, 'test_est': test_est, 'test_est_p': test_est_p})



# In[20]:


import matplotlib.pyplot as plt
# ROC points computed from the predicted probabilities of each partition.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
# Draw the test curve first, then the training curve, on one figure.
plt.figure(figsize=[6, 6])
for fpr, tpr in ((fpr_test, tpr_test), (fpr_train, tpr_train)):
    plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.show()

# Area under the test ROC curve.
test_auc = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % test_auc)



# In[21]:


import pydotplus
from IPython.display import Image
import sklearn.tree as tree

# Export the fitted tree as Graphviz DOT; the result is handed straight to
# pydotplus below.
export_options = dict(
    out_file=None,
    feature_names=data.columns,
    max_depth=5,
    class_names=['0', '1'],
    filled=True,
)
dot_data = tree.export_graphviz(clf, **export_options)

# Render the DOT data to PNG and display the image.
graph = pydotplus.graph_from_dot_data(dot_data)
png_bytes = graph.create_png()
Image(png_bytes)


# In[22]:



from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = MinMaxScaler().fit(train_data)
scaled_train_data, scaled_test_data = (
    scaler.transform(part) for part in (train_data, test_data)
)


# In[23]:



from sklearn.neural_network import MLPClassifier
# Candidate architectures, activations and alpha values for the network.
param_grid = dict(
    hidden_layer_sizes=[(10, ), (15, ), (20, ), (5, 5)],
    activation=['logistic', 'tanh', 'relu'],
    alpha=[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
)
mlp = MLPClassifier(max_iter=1000)
# Search the whole grid with 4-fold CV, scored by ROC AUC, with n_jobs=-1.
gcv = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
)
gcv.fit(scaled_train_data, train_target)

gcv.best_params_

# Column-1 probabilities from the fitted search object for both partitions.
test_est_p = gcv.predict_proba(scaled_test_data)[:, 1]
train_est_p = gcv.predict_proba(scaled_train_data)[:, 1]


# In[24]:



# ROC points computed from the predicted probabilities of each partition.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
# Draw the test curve first, then the training curve, on one figure.
plt.figure(figsize=[4, 4])
for fpr, tpr in ((fpr_test, tpr_test), (fpr_train, tpr_train)):
    plt.plot(fpr, tpr, '')
plt.title('ROC curve')
plt.show()
# Area under the test ROC curve.
test_auc = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % test_auc)


# # 变量分箱

# In[25]:


import os
import pandas as pd

# Reload the raw file, passing skipinitialspace=True to the CSV parser.
data = pd.read_csv('date_data2.csv', skipinitialspace=True)

target = data['Dated']
# Keep the first four columns as predictors. ``DataFrame.ix`` was removed in
# pandas 1.0, so select them with the purely positional ``iloc`` instead.
data = data.iloc[:, :4]
data.head()



# In[26]:



import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import sklearn.model_selection as cross_validation
# Split into training and test sets (60/40, fixed seed).
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    data,
    target,
    test_size=0.4,
    train_size=0.6,
    random_state=12345,
)


# In[27]:



# Candidate hyper-parameters for the decision tree.
param_grid = dict(
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 4, 6, 8, 10],
)
clf = tree.DecisionTreeClassifier()
# Search the whole grid with 4-fold cross-validation, scored by ROC AUC.
clfcv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
)
clfcv.fit(train_data, train_target)
#%%
clfcv.best_params_


# In[28]:



# Build the final tree from the hyper-parameters the grid search selected,
# rather than from values copied by hand, so the model always matches the
# search result even when the data, grid or CV folds change.
clf = tree.DecisionTreeClassifier(**clfcv.best_params_)
clf.fit(train_data, train_target)


# Predicted labels and column-1 probabilities for both partitions.
train_est = clf.predict(train_data)
train_est_p = clf.predict_proba(train_data)[:, 1]
test_est = clf.predict(test_data)
test_est_p = clf.predict_proba(test_data)[:, 1]
# Side-by-side view of the true label, predicted label and predicted score.
pd.DataFrame({'test_target': test_target, 'test_est': test_est, 'test_est_p': test_est_p})




# In[29]:


import matplotlib.pyplot as plt
# ROC points computed from the predicted probabilities of each partition.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
# Draw the test curve first, then the training curve, on one figure.
plt.figure(figsize=[6, 6])
for fpr, tpr in ((fpr_test, tpr_test), (fpr_train, tpr_train)):
    plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.show()

# Area under the test ROC curve.
test_auc = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % test_auc)


# In[30]:


import pydotplus
from IPython.display import Image
import sklearn.tree as tree

# Export the fitted tree as Graphviz DOT; the result is handed straight to
# pydotplus below.
export_options = dict(
    out_file=None,
    feature_names=data.columns,
    max_depth=5,
    class_names=['0', '1'],
    filled=True,
)
dot_data = tree.export_graphviz(clf, **export_options)

# Render the DOT data to PNG and display the image.
graph = pydotplus.graph_from_dot_data(dot_data)
png_bytes = graph.create_png()
Image(png_bytes)


# In[31]:



# Redo the 60/40 split with the same seed as before.
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    data,
    target,
    test_size=0.4,
    train_size=0.6,
    random_state=12345,
)
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = MinMaxScaler().fit(train_data)
scaled_train_data, scaled_test_data = (
    scaler.transform(part) for part in (train_data, test_data)
)



# In[32]:


from sklearn.neural_network import MLPClassifier
# Candidate architectures, activations and alpha values for the network.
param_grid = dict(
    hidden_layer_sizes=[(10, ), (15, ), (20, ), (5, 5)],
    activation=['logistic', 'tanh', 'relu'],
    alpha=[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10],
)

mlp = MLPClassifier(max_iter=1000)
# Search the whole grid with 4-fold CV, scored by ROC AUC, with n_jobs=-1.
gcv = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
)
gcv.fit(scaled_train_data, train_target)


gcv.best_params_


# In[33]:


# Column-1 probabilities from the fitted search object for both partitions.
test_est_p = gcv.predict_proba(scaled_test_data)[:, 1]
train_est_p = gcv.predict_proba(scaled_train_data)[:, 1]


# In[34]:


# ROC points computed from the predicted probabilities of each partition.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
# Draw the test curve first, then the training curve, on one figure.
plt.figure(figsize=[4, 4])
for fpr, tpr in ((fpr_test, tpr_test), (fpr_train, tpr_train)):
    plt.plot(fpr, tpr, '')
plt.title('ROC curve')
plt.show()
# Area under the test ROC curve.
test_auc = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % test_auc)


# In[ ]:





推荐 0
本文由 顺子的 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册