八大直播作业-第五讲-Aringrhus

浏览: 1286
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from scipy import stats
from sklearn.cross_validation import train_test_split
get_ipython().magic('matplotlib inline')


# In[2]:


telecom_churn = pd.read_csv(r'telecom_churn.csv')


# In[3]:


telecom_churn.head()
# In[4]:


cat_cols = ['churn','posTrend','negTrend','prom','curPlan','avgplan','planChange','posPlanChange','negPlanChange']
for col in cat_cols:
telecom_churn[col] = telecom_churn[col].astype(int)

# # 两变量分析:检验该用户通话时长是否呈现出上升态势(posTrend)对流失(churn) 是否有预测价值

# In[5]:


cross1 = pd.crosstab(telecom_churn.posTrend,telecom_churn.churn, margins=True)
cross1

snipaste_20180619_111225.png

# In[6]:


def percConvert(ser):
return ser/float(ser[-1])

cross1.apply(percConvert, axis=1)

snipaste_20180619_111234.png

# In[7]:


cross1.apply(percConvert, axis=0)

snipaste_20180619_111240.png


# In[8]:


cross1.iloc[:2, :2]


# In[9]:


telecom_churn.plot(x='duration', y='churn', kind='scatter')

snipaste_20180619_111249.png

# In[10]:


print('''chisq = %6.4f
p-value = %6.4f
dof = %i
expected_freq = %s''' %stats.chi2_contingency(cross1.iloc[:2, :2]))

snipaste_20180619_111255.png


# # 首先将原始数据拆分为训练和测试数据集,使用训练数据集建立在网时长对流失的逻辑回归,使用测试数据集制作混淆矩阵(阈值为0.5),提供准确性、召回率指标,提供ROC曲线和AUC。

# In[11]:


X_train, X_test, y_train, y_test = train_test_split(telecom_churn.drop('churn',axis=1), telecom_churn['churn'], test_size=0.3, random_state=123)


# In[12]:


pd.concat([X_train,y_train],axis=1)


# In[13]:


lg = smf.glm('churn ~ duration', data=pd.concat([X_train,y_train],axis=1),
family=sm.families.Binomial(sm.families.links.logit)).fit()
lg.summary()

snipaste_20180619_135255.jpg

# In[14]:


X_test.head(10)


# In[15]:


X_train['churn'] = y_train
X_test['proba'] = lg.predict(X_test)
X_train['proba'] = lg.predict(X_train.drop('churn',axis=1))
X_test['churn']= y_test
X_test.head(10)


# In[16]:


X_test['prediction'] = (X_test['proba'] > 0.5).astype('int')


# In[17]:


pd.crosstab(X_test.churn, X_test.prediction, margins=True)

snipaste_20180619_135308.jpg

# In[18]:


# 准确率
acc = sum(X_test['prediction'] == X_test['churn']) /np.float(len(X_test))
print('The accurancy is %.2f' %acc)

snipaste_20180619_135324.jpg

# In[19]:


X_test.loc[(X_test['prediction'] == 1) & (X_test['churn'] == 1)]


# In[20]:


# 召回率
recall = len(X_test[(X_test['prediction'] == 1) & (X_test['churn'] == 1)]) / sum(X_test['churn'] == 1)
print('The recall is %.2f' %recall)
# In[21]:


# ROC曲线

import sklearn.metrics as metrics

fpr_test, tpr_test, th_test = metrics.roc_curve(X_test.churn, X_test.proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(X_train.churn, X_train.proba)

plt.figure(figsize=[12, 12])
plt.plot(fpr_test, tpr_test, 'b--')
plt.plot(fpr_train, tpr_train, '')
plt.title('ROC curve')
plt.show()

snipaste_20180619_135334.jpg


# In[22]:


print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

snipaste_20180619_135340.jpg


# # 使用向前逐步法从其它备选变量中选择变量,构建基于AIC的最优模型,绘制ROC曲线,同时检验模型的膨胀系数

# In[23]:


# 向前法
def forward_select(data, response):
remaining = set(data.columns)
remaining.remove(response)
selected = []
current_score, best_new_score = float('inf'), float('inf')
while remaining:
aic_with_candidates=[]
for candidate in remaining:
formula = "{} ~ {}".format(
response,' + '.join(selected + [candidate]))
aic = smf.glm(
formula=formula, data=data,
family=sm.families.Binomial(sm.families.links.logit)
).fit().aic
aic_with_candidates.append((aic, candidate))
aic_with_candidates.sort(reverse=True)
best_new_score, best_candidate=aic_with_candidates.pop()
if current_score > best_new_score:
remaining.remove(best_candidate)
selected.append(best_candidate)
current_score = best_new_score
print ('aic is {},continuing!'.format(current_score))
else:
print ('forward selection over!')
break

formula = "{} ~ {} ".format(response,' + '.join(selected))
print('final formula is {}'.format(formula))
model = smf.glm(
formula=formula, data=data,
family=sm.families.Binomial(sm.families.links.logit)
).fit()
return(model)
# In[24]:


candidates = ['churn','duration','AGE','edu_class','posTrend','negTrend','nrProm','prom','curPlan','avgplan','planChange','incomeCode','feton','peakMinAv','peakMinDiff','call_10086']
data_for_select = X_train[candidates]
lg_m1 = forward_select(data=data_for_select, response='churn')
lg_m1.summary()

snipaste_20180619_135624.jpg

# In[25]:


def vif(df, col_i):
from statsmodels.formula.api import ols

cols = list(df.columns)
cols.remove(col_i)
cols_noti = cols
formula = col_i + '~' + '+'.join(cols_noti)
r2 = ols(formula, df).fit().rsquared
return 1. / (1. - r2)
# In[26]:


exog = X_train[candidates].drop(['churn'], axis=1)
for i in exog.columns:
print(i, '\t', vif(df=exog, col_i=i))

snipaste_20180619_135634.jpg


# In[27]:


lg2 = smf.glm('churn ~ duration + feton + peakMinDiff + call_10086 + AGE + edu_class + prom + nrProm + posTrend + incomeCode + peakMinAv', data=X_train.drop(['proba'],axis=1),
family=sm.families.Binomial(sm.families.links.logit)).fit()
lg2.summary()

snipaste_20180619_135641.jpg

# In[28]:


# X_train.head()
X_test.head()


# In[29]:


X_train['proba2'] = lg2.predict(X_train.drop(['churn','proba'],axis=1))
X_test['proba2'] = lg2.predict(X_test.drop(['churn','proba','prediction'],axis=1))
print(X_train.head(10))
print(X_test.head(10))


# In[30]:


X_test['prediction2'] = (X_test['proba2'] > 0.5).astype('int')
# In[31]:


# 准确率
acc = sum(X_test['prediction2'] == X_test['churn']) /np.float(len(X_test))
print('The accurancy is %.2f' %acc)

snipaste_20180619_135652.jpg

# In[32]:


# 召回率
recall = len(X_test[(X_test['prediction2'] == 1) & (X_test['churn'] == 1)]) / sum(X_test['churn'] == 1)
print('The recall is %.2f' %recall)

snipaste_20180619_135657.jpg

# In[33]:


# ROC曲线

import sklearn.metrics as metrics

fpr_test, tpr_test, th_test = metrics.roc_curve(X_test.churn, X_test.proba2)
fpr_train, tpr_train, th_train = metrics.roc_curve(X_train.churn, X_train.proba2)

plt.figure(figsize=[12, 12])
plt.plot(fpr_test, tpr_test, 'b--')
plt.plot(fpr_train, tpr_train, '')
plt.title('ROC curve')
plt.show()

snipaste_20180619_135710.jpg

# In[34]:


print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

snipaste_20180619_135715.jpg

推荐 0
本文由 顺子的 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册