# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): hard-coded, machine-specific working directory. The relative
# CSV path read later is resolved against it, so adjust before running elsewhere.
os.chdir(r"D:\learningFiels\天善常课\061第6讲\提交-第六讲:使用决策树和神经网络预测客户流失倾向\作业")
# `InteractiveShell.magic()` is deprecated; `run_line_magic` is the supported
# way to invoke the `%matplotlib inline` line magic from exported code.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[2]:
marrige = pd.read_csv('date_data2.csv')
marrige.head()
# Column dictionary:
#
# income - average monthly income (yuan)
#
# attractive - personal attractiveness score assigned by the dating site, on a 0-100 scale.
#
# assets - assets (in units of 10,000 yuan)
#
# edueduclass - education level: 1 = primary school, 2 = junior high school; 3 = senior high school, 4 = bachelor's degree, 5 = master's degree or above
#
# Dated - whether the date was successful: 1 means success
# # Data exploration
# In[23]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
marrige.info()
# Summary statistics of the numeric columns (shown as the cell's output).
marrige.describe()
# In[16]:
# Box plot of monthly income, grouped by dating outcome.
sns.boxplot(data=marrige, x='Dated', y='income')
# In[17]:
# Box plot of the attractiveness score, grouped by dating outcome.
sns.boxplot(data=marrige, x='Dated', y='attractive')
# In[19]:
# Box plot of assets, grouped by dating outcome.
sns.boxplot(data=marrige, x='Dated', y='assets')
# In[22]:
# Mean of the `Dated` flag per education level, drawn as a bar chart.
dated_mean_by_edu = marrige.groupby('edueduclass')['Dated'].mean()
dated_mean_by_edu.plot.bar()
# # First-pass decision tree
# In[25]:
import sklearn.model_selection as cross_validation
# Features are every column except the last one; the label is `Dated`.
data = marrige.iloc[:, :-1]
target = marrige['Dated']
data.head()
# In[26]:
# 70% of the rows go to training; the seed fixes the shuffle.
split_parts = cross_validation.train_test_split(
    data, target, train_size=0.7, random_state=123)
X_train, X_test, y_train, y_test = split_parts
# In[31]:
import sklearn.tree as tree
# In[65]:
tree_params = dict(criterion='entropy', max_depth=8,
                   min_samples_split=3, min_samples_leaf=2)
clf = tree.DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)
# In[66]:
# Hard labels plus the second predict_proba column for the training rows.
train_pre = clf.predict(X_train)
train_pre_p = clf.predict_proba(X_train)[:, 1]
# In[67]:
# The same pair of outputs for the held-out test rows.
test_pre = clf.predict(X_test)
test_pre_p = clf.predict_proba(X_test)[:, 1]
# In[68]:
# Side-by-side table of actual label, predicted label and predicted probability.
test_report = pd.DataFrame({'test_target':y_test,'test_est':test_pre,'test_est_p':test_pre_p})
test_report
# In[69]:
import sklearn.metrics as metrics
# In[70]:
conf_mat = metrics.confusion_matrix(y_test, test_pre, labels=[0, 1])
print(conf_mat)
# In[71]:
# Sum of the actual test labels.
y_test.sum()
# In[72]:
# Sum of the predicted test labels.
test_pre.sum()
# In[73]:
cls_report = metrics.classification_report(y_test, test_pre)
print(cls_report)
# In[74]:
# Pair each feature column name with its importance in the fitted tree.
importance_rows = list(zip(data.columns, clf.feature_importances_))
print(pd.DataFrame(importance_rows))
# In[75]:
# roc_curve expects continuous scores. Feeding it the hard 0/1 predictions
# collapses the curve to a single operating point and distorts the AUC, so
# the predicted probabilities computed earlier are used instead.
fpr_test,tpr_test,th_test = metrics.roc_curve(y_test,test_pre_p)
# In[84]:
print(fpr_test,tpr_test,th_test)
# In[77]:
fpr_train,tpr_train,th_train = metrics.roc_curve(y_train,train_pre_p)
# In[86]:
print(fpr_train,tpr_train,th_train)
# In[78]:
# Two colours from seaborn's "Set1" palette for the two curves.
red, blue = sns.color_palette("Set1", 2)
# In[79]:
plt.figure(figsize=[6,6])
# Test curve in blue, training curve in red.
plt.plot(fpr_test, tpr_test, color=blue)
plt.plot(fpr_train, tpr_train, color=red)
plt.title('ROC curve')
plt.show()
# In[83]:
# Area under the test-set ROC curve.
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))
# # First-pass neural network
# In[4]:
import sklearn.model_selection as cross_validation
# Features are every column except the last one; the label is `Dated`.
data = marrige.iloc[:, :-1]
target = marrige['Dated']
data.head()
# In[5]:
split_parts = cross_validation.train_test_split(
    data, target, train_size=0.7, random_state=123)
X_train, X_test, y_train, y_test = split_parts
# In[8]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train)
scaled_train_data = scaler.transform(X_train)
scaled_test_data = scaler.transform(X_test)
# In[10]:
from sklearn.neural_network import MLPClassifier
# MLP weight initialisation is random, so without a seed every run produced a
# different model and different metrics. random_state pins the fit; the value
# matches the seed used for the train/test split above.
mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation='logistic', alpha=0.1, max_iter=1000,
                    random_state=123)
mlp.fit(scaled_train_data, y_train)
mlp
# In[11]:
# Hard class predictions for both splits.
train_predict = mlp.predict(scaled_train_data)
test_predict = mlp.predict(scaled_test_data)
# In[12]:
# Compute probabilities: keep the second predict_proba column for each split.
train_proba = mlp.predict_proba(scaled_train_data)[:, 1]
test_proba = mlp.predict_proba(scaled_test_data)[:, 1]
# In[13]:
from sklearn import metrics
conf_mat = metrics.confusion_matrix(y_test, test_predict, labels=[0, 1])
print(conf_mat)
cls_report = metrics.classification_report(y_test, test_predict)
print(cls_report)
# In[15]:
# Score of the fitted network on the scaled test split.
mlp.score(scaled_test_data, y_test)
# In[16]:
# ROC points from the predicted probabilities of each split.
fpr_train, tpr_train, th_train = metrics.roc_curve(y_train, train_proba)
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
fig, ax = plt.subplots(figsize=[4, 4])
ax.plot(fpr_test, tpr_test, '')
ax.plot(fpr_train, tpr_train, '')
ax.set_title('ROC curve')
plt.show()
auc_test = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % auc_test)
# # Grid search for hyperparameters
# In[17]:
import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import sklearn.model_selection as cross_validation
# 60/40 train/test split with a fixed shuffle seed.
split_parts = cross_validation.train_test_split(
    data, target, test_size=0.4, train_size=0.6, random_state=12345)
train_data, test_data, train_target, test_target = split_parts
# In[18]:
# Candidate values tried for each decision-tree hyperparameter.
param_grid = dict(
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 4, 6, 8, 10],
)
clf = tree.DecisionTreeClassifier()
# 4-fold cross-validated search that scores candidates by ROC AUC.
clfcv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=4)
clfcv.fit(train_data, train_target)
clfcv.best_params_
# In[19]:
# Re-train a tree with the hyperparameters the grid search actually selected,
# rather than values copied by hand from an earlier run, so this model cannot
# drift out of sync with `clfcv.best_params_`.
clf = tree.DecisionTreeClassifier(**clfcv.best_params_)
clf.fit(train_data, train_target)
# Hard labels and the second predict_proba column for each split.
train_est = clf.predict(train_data)
train_est_p=clf.predict_proba(train_data)[:,1]
test_est=clf.predict(test_data)
test_est_p=clf.predict_proba(test_data)[:,1]
# Side-by-side table of actual label, predicted label and predicted probability.
pd.DataFrame({'test_target':test_target,'test_est':test_est,'test_est_p':test_est_p})
# In[20]:
import matplotlib.pyplot as plt
# ROC points from the predicted probabilities of each split.
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fig, ax = plt.subplots(figsize=[6,6])
ax.plot(fpr_test, tpr_test)
ax.plot(fpr_train, tpr_train)
ax.set_title('ROC curve')
plt.show()
auc_test = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % auc_test)
# In[21]:
import pydotplus
from IPython.display import Image
import sklearn.tree as tree
# Options for the Graphviz export: no output file, feature names from the
# feature frame, at most 5 levels drawn, class labels '0'/'1', filled nodes.
export_options = dict(
    out_file=None,
    feature_names=data.columns,
    max_depth=5,
    class_names=['0', '1'],
    filled=True,
)
dot_data = tree.export_graphviz(clf, **export_options)
# Build a graph from the DOT data and display its PNG rendering.
graph = pydotplus.graph_from_dot_data(dot_data)
tree_png = graph.create_png()
Image(tree_png)
# In[22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the scaling from the training split and transform it in one step,
# then reuse the fitted scaler on the test split.
scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)
# In[23]:
from sklearn.neural_network import MLPClassifier
# Candidate values tried for each MLP hyperparameter.
param_grid = {
    'hidden_layer_sizes':[(10, ), (15, ), (20, ), (5, 5)],
    'activation':['logistic', 'tanh', 'relu'],
    'alpha':[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]
}
# MLP weight initialisation is random; without a seed the search could pick a
# different winner on every run. The value matches the split seed above.
mlp = MLPClassifier(max_iter=1000, random_state=12345)
# 4-fold cross-validated search scored by ROC AUC, using all CPU cores.
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=-1)
gcv.fit(scaled_train_data, train_target)
# A bare `gcv.best_params_` in the middle of a cell is evaluated and thrown
# away, so the chosen parameters were never shown; print them explicitly.
print(gcv.best_params_)
# Second predict_proba column from the search's refitted best model.
test_est_p=gcv.predict_proba(scaled_test_data)[:,1]
train_est_p=gcv.predict_proba(scaled_train_data)[:,1]
# In[24]:
# ROC points from the predicted probabilities of each split.
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fig, ax = plt.subplots(figsize=[4, 4])
ax.plot(fpr_test, tpr_test, '')
ax.plot(fpr_train, tpr_train, '')
ax.set_title('ROC curve')
plt.show()
auc_test = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % auc_test)
# # Variable binning
# In[25]:
import os
import pandas as pd
# Re-read the raw CSV; skipinitialspace drops blanks that follow a delimiter.
data = pd.read_csv('date_data2.csv', skipinitialspace=True)
target = data['Dated']
# `DataFrame.ix` was removed in pandas 1.0 and raises AttributeError there.
# `.iloc` performs the same positional slice: all rows, first four columns.
data = data.iloc[:, :4]
data.head()
# In[26]:
import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import sklearn.model_selection as cross_validation
# Split into training and test sets (60/40, fixed shuffle seed).
split_parts = cross_validation.train_test_split(
    data, target, test_size=0.4, train_size=0.6, random_state=12345)
train_data, test_data, train_target, test_target = split_parts
# In[27]:
# Candidate values tried for each decision-tree hyperparameter.
param_grid = dict(
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 4, 6, 8, 10],
)
clf = tree.DecisionTreeClassifier()
# 4-fold cross-validated search that scores candidates by ROC AUC.
clfcv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=4)
clfcv.fit(train_data, train_target)
#%%
clfcv.best_params_
# In[28]:
# Re-train a tree with the hyperparameters the grid search actually selected,
# rather than values copied by hand from an earlier run, so this model cannot
# drift out of sync with `clfcv.best_params_`.
clf = tree.DecisionTreeClassifier(**clfcv.best_params_)
clf.fit(train_data, train_target)
# Hard labels and the second predict_proba column for each split.
train_est = clf.predict(train_data)
train_est_p=clf.predict_proba(train_data)[:,1]
test_est=clf.predict(test_data)
test_est_p=clf.predict_proba(test_data)[:,1]
# Side-by-side table of actual label, predicted label and predicted probability.
pd.DataFrame({'test_target':test_target,'test_est':test_est,'test_est_p':test_est_p})
# In[29]:
import matplotlib.pyplot as plt
# ROC points from the predicted probabilities of each split.
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fig, ax = plt.subplots(figsize=[6,6])
ax.plot(fpr_test, tpr_test)
ax.plot(fpr_train, tpr_train)
ax.set_title('ROC curve')
plt.show()
auc_test = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % auc_test)
# In[30]:
import pydotplus
from IPython.display import Image
import sklearn.tree as tree
# Options for the Graphviz export: no output file, feature names from the
# feature frame, at most 5 levels drawn, class labels '0'/'1', filled nodes.
export_options = dict(
    out_file=None,
    feature_names=data.columns,
    max_depth=5,
    class_names=['0', '1'],
    filled=True,
)
dot_data = tree.export_graphviz(clf, **export_options)
# Build a graph from the DOT data and display its PNG rendering.
graph = pydotplus.graph_from_dot_data(dot_data)
tree_png = graph.create_png()
Image(tree_png)
# In[31]:
# Same 60/40 split and seed as the tree section of this part of the notebook.
split_parts = cross_validation.train_test_split(
    data, target, test_size=0.4, train_size=0.6, random_state=12345)
train_data, test_data, train_target, test_target = split_parts
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the scaling from the training split and transform it in one step,
# then reuse the fitted scaler on the test split.
scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)
# In[32]:
from sklearn.neural_network import MLPClassifier
# Candidate values tried for each MLP hyperparameter.
param_grid = {
    'hidden_layer_sizes':[(10, ), (15, ), (20, ), (5, 5)],
    'activation':['logistic', 'tanh', 'relu'],
    'alpha':[0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]
}
# MLP weight initialisation is random; without a seed the search could pick a
# different winner on every run. The value matches the split seed above.
mlp = MLPClassifier(max_iter=1000, random_state=12345)
# 4-fold cross-validated search scored by ROC AUC, using all CPU cores.
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring='roc_auc', cv=4, n_jobs=-1)
gcv.fit(scaled_train_data, train_target)
gcv.best_params_
# In[33]:
# Second predict_proba column from the search's refitted best model.
test_est_p=gcv.predict_proba(scaled_test_data)[:,1]
train_est_p=gcv.predict_proba(scaled_train_data)[:,1]
# In[34]:
# ROC points from the predicted probabilities of each split.
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fig, ax = plt.subplots(figsize=[4, 4])
ax.plot(fpr_test, tpr_test, '')
ax.plot(fpr_train, tpr_train, '')
ax.set_title('ROC curve')
plt.show()
auc_test = metrics.auc(fpr_test, tpr_test)
print('AUC = %6.4f' % auc_test)
# In[ ]:
# 本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。