八大直播作业-第三讲-Aringrhus

浏览: 1196
# coding: utf-8

# In[243]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
# Render matplotlib figures inline in the notebook.
# run_line_magic() replaces the deprecated InteractiveShell.magic() helper.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


import os
os.getcwd()


# In[86]:


from pylab import mpl
mpl.rcParams["font.sans-serif"]=["SimHei"] # set the default sans-serif font (presumably so the Chinese plot titles render)
mpl.rcParams["axes.unicode_minus"]=False # fix the minus sign "-" showing up as a box in saved figures


# In[8]:


# Load the auto-insurance data set (GBK-encoded CSV in the working directory)
# and print its column/dtype summary.
auto_ins = pd.read_csv(r'./auto_ins.csv',encoding='gbk')
auto_ins.info()
# auto_ins.head()


# # homework04

# ## 首先对loss重新编码为1/0,有数值为1,命名为loss_flag

# In[50]:


# Binary target: 1 when a loss amount is recorded and non-zero, otherwise 0.
# Missing amounts count as "no loss"; the previous map(bool) flagged them as 1
# because bool(NaN) is True.
auto_ins['loss_flag'] = auto_ins['Loss'].fillna(0).ne(0).astype(int)
auto_ins.tail(10)


# ## 对loss_flag分布情况进行描述分析

# In[35]:


# Frequency table of loss_flag: raw count, share of all records, and a totals row.
loss_counts = auto_ins.loss_flag.value_counts()
loss_desc = pd.DataFrame(loss_counts)
# The denominator comes from the data itself rather than a hard-coded row
# count (previously 4233), so the shares stay correct if the file changes.
loss_desc['row_sum'] = (loss_counts / loss_counts.sum()).round(2)
loss_desc.loc['col_sum'] = loss_desc.sum()
loss_desc


# In[45]:


# Bar chart of how many records fall into each loss_flag value.
auto_ins.loss_flag.value_counts().plot(kind='bar')

01.png


# Pie chart of the loss_flag shares stored in loss_desc['row_sum'].
plt.figure(figsize=(4, 4))
paches, texts, autotexts = plt.pie(
    [loss_desc.loc[0, 'row_sum'], loss_desc.loc[1, 'row_sum']],
    labels=["0", "1"],
    autopct="%1.1f%%",
    colors=["#FA8072", "#BBFFFF"],
)

# Loop bodies re-indented: the flattened source left both `for` statements
# without a body, which is not valid Python.
for text in texts + autotexts:
    text.set_fontsize(12)
for text in autotexts:
    text.set_color("black")

02.png


# ## 分析是否出险和年龄、驾龄、性别、婚姻状态等变量之间的关系  

# In[62]:


# Names of every column except the raw loss amount and the derived flag.
cols = auto_ins.columns.tolist()
for target_col in ('Loss', 'loss_flag'):
    cols.remove(target_col)
cols


# ### Gender

# In[182]:


# Contingency table of Gender (rows) against loss_flag (columns), drawn as stacked bars.
gender_desc = pd.crosstab(index=auto_ins['Gender'], columns=auto_ins['loss_flag'])
gender_desc.plot.bar(stacked=True)
# gender_desc

03.png


# In[180]:


def bar_line(data, x_label, y_label, title, *, legend_labels=('not default', 'default loss')):
    """Plot stacked 0/1 counts per category with a rate line on a second axis.

    Parameters
    ----------
    data : DataFrame holding both columns.
    x_label : name of the categorical column shown on the x axis.
    y_label : name of the binary column; the cross-tabulation must contain
        both the values 0 and 1.
    title : figure title.
    legend_labels : legend text for the 0-bars and the 1-bars (optional,
        defaults to the labels that were previously hard-coded).

    The function draws on a new figure and returns nothing.
    """
    counts = pd.crosstab(data[x_label], data[y_label])
    # NOTE(review): this is each category's share of ALL rows with flag 1,
    # not the share of 1s inside the category -- confirm that is intended.
    rate_list = list((counts[1] / data[y_label].sum()).round(2))

    categories = list(counts.index)
    # Bars, line and annotations all use the same integer positions so they
    # stay aligned even when the category values themselves are numeric.
    positions = range(len(categories))
    y_bottom = list(counts[0])
    y_top = list(counts[1])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.grid(True)
    ax1.bar(positions, y_bottom, label=legend_labels[0], fc='#FA8072')
    ax1.bar(positions, y_top, bottom=y_bottom, label=legend_labels[1],
            tick_label=categories, fc='#BBFFFF')
    # The category name belongs on the x axis; the left y axis shows counts.
    ax1.set_xlabel(x_label)
    ax1.set_ylabel('count')
    ax1.set_title(title)
    ax1.legend()

    ax2 = ax1.twinx()
    ax2.plot(positions, rate_list)
    ax2.set_ylabel(y_label)
    ax2.grid(False)
    # Annotate each point with its value, slightly above the marker.
    for pos, rate in zip(positions, rate_list):
        ax2.text(pos, rate + 0.04, str(rate), ha='center', va='bottom', fontsize=10.5)


# In[183]:


# Stacked loss_flag counts per Gender with the rate line overlaid.
bar_line(auto_ins,'Gender','loss_flag','性别与逾期关系')

04.png


# ### Marital

# In[181]:


# Stacked loss_flag counts per Marital status with the rate line overlaid.
bar_line(auto_ins,'Marital','loss_flag','婚姻状态与逾期关系')

05.png


# ### Owner

# In[185]:


# Stacked loss_flag counts per Owner category with the rate line overlaid.
bar_line(auto_ins,'Owner','loss_flag','归属主体与逾期关系')

06.png


# ### Age

# In[225]:


# Distribution of Age for each loss_flag value.
sns.boxplot(x = 'loss_flag', y = 'Age', data = auto_ins)

07.png

# ### vAge

# In[226]:


# Distribution of vAge for each loss_flag value.
sns.boxplot(x = 'loss_flag', y = 'vAge', data = auto_ins)

08.png




# HWK5
# # homework05

# In[186]:


os.getcwd()


# In[189]:


# Load the clients table (GBK-encoded), preview the first rows and print the schema.
clients = pd.read_csv(r"../HW5/clients.csv",encoding='gbk')
print(clients.head())
clients.info()


# In[190]:


# Load the card table (GBK-encoded), preview the first rows and print the schema.
card = pd.read_csv(r"../HW5/card.csv",encoding='gbk')
print(card.head())
card.info()


# In[191]:


# Load the accounts table (GBK-encoded), preview the first rows and print the schema.
accounts = pd.read_csv(r"../HW5/accounts.csv",encoding='gbk')
print(accounts.head())
accounts.info()


# In[192]:


# Load the disp table (GBK-encoded), preview the first rows and print the schema.
# It is used below to link disp_id, client_id and account_id.
disp = pd.read_csv(r"../HW5/disp.csv",encoding='gbk')
print(disp.head())
disp.info()


# ## 不同类型卡的持卡人的性别对比

# In[203]:


# Attach client_id to every card through disp_id, then attach the client
# attributes through client_id. Both are left joins, so every card row is kept.
card_disp = pd.merge(card,disp[['disp_id','client_id']],how='left',on='disp_id')
card_disp.head(10)
client_card = pd.merge(card_disp,clients,how='left',on='client_id')
client_card.head(10)


# In[209]:


# Share of each sex within every card type, shown as 100%-stacked bars.
card_sex = pd.crosstab(index=client_card['type'], columns=client_card['sex'])
# card_sex['sum1'] = card_sex.sum(1)
card_sex = card_sex.div(card_sex.sum(axis=1), axis='index')
card_sex.plot.bar(stacked=True)

09.png


# ## 不同类型卡的持卡人在办卡时的平均年龄对比

# In[213]:


import datetime
def convert_date(date_str):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.datetime``."""
    return datetime.datetime.strptime(date_str, '%Y-%m-%d')
# Parse the card issue date and the client birth date into datetime columns.
client_card['issued_date'] = client_card['issued'].map(convert_date)
client_card['birth_date_t'] = client_card['birth_date'].map(convert_date)


# In[222]:


# Age at card issuance: elapsed time between birth and issue date, in years.
client_card['issued_age_dt'] = client_card['issued_date']-client_card['birth_date_t']
# NOTE(review): days/365 ignores leap days and round() rounds to the nearest
# year instead of truncating, so ages can come out one year high -- confirm
# this approximation is acceptable.
client_card['issued_age'] = client_card['issued_age_dt'].map(lambda x:round(x.days/365))


# In[223]:


client_card.head()


# In[224]:


# Mean age at issuance for each card type, as a bar chart.
client_card.groupby('type')['issued_age'].mean().plot.bar()

10.png


# ## 不同类型卡的持卡人在办卡前一年内的平均帐户余额对比

# In[236]:


# Load the transactions, discard the columns not used below and give the
# remaining six columns their working names.
trans = pd.read_csv(r"../HW5/trans.csv", encoding='gbk').drop(
    columns=['trans_id', 'k_symbol', 'bank', 'account']
)
# trans.head()
trans.columns = ['account_id', 'trans_date', 'type', 'operation', 'amount', 'balance']
trans.head()


# In[246]:


# Parse the transaction date. This line was commented out, which left
# trans_date_dt undefined and made the next statement fail on a fresh run.
trans['trans_date_dt'] = trans['trans_date'].map(convert_date)
# Date 365 days before each transaction (vectorised datetime arithmetic).
trans['trans_pre_year'] = trans['trans_date_dt'] - timedelta(days=365)
trans.head()


# In[249]:


# Attach account_id to every card holder through client_id. This merge was
# commented out, so client_card_account was undefined on a fresh run.
client_card_account = pd.merge(client_card,disp[['account_id','client_id']],how='left',on='client_id')
# Start of the one-year (365-day) window that ends at the card issue date.
client_card_account['issued_pre_year'] = client_card_account['issued_date'] - timedelta(days=365)
client_card_account.head()


# In[250]:


# Attach every transaction of the holder's account (left join on account_id).
# Both inputs carry a 'type' column, so pandas suffixes them: type_x is the
# card type and type_y is the transaction type.
client_card_account_trans = pd.merge(client_card_account,trans[['account_id','type','amount','balance','trans_date_dt']],how='left',on='account_id')
client_card_account_trans.head()


# In[260]:


# Transactions made within the year before the card was issued.
# The previous filter (trans_date_dt <= issued_pre_year) kept transactions
# OLDER than one year, which contradicts the stated analysis window.
trans_dates = client_card_account_trans['trans_date_dt']
in_pre_year = (
    (trans_dates >= client_card_account_trans['issued_pre_year'])
    & (trans_dates < client_card_account_trans['issued_date'])
)
# .copy() so the column assignments below modify an independent frame rather
# than a slice of client_card_account_trans.
all_trans_pre_year = client_card_account_trans[in_pre_year].copy()
# Strip the currency formatting ("$1,234" -> 1234) from both money columns.
for money_col in ('balance', 'amount'):
    all_trans_pre_year[money_col] = (
        all_trans_pre_year[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )
all_trans_pre_year.head()


# In[289]:


# Mean balance per account, then the card type (type_x) attached to each account.
client_balance_mean = (
    all_trans_pre_year.groupby('account_id')['balance'].mean().reset_index()
)
# The right-hand side has one row per transaction, so the join repeats each
# account; drop_duplicates collapses it back to distinct rows.
client_balance_mean = client_balance_mean.merge(
    all_trans_pre_year[['account_id', 'type_x']], how='left', on='account_id'
).drop_duplicates()
client_balance_mean.head()


# In[290]:


# Distribution of the per-account mean balance for each card type (type_x).
sns.boxplot(x='type_x',y='balance',data=client_balance_mean)

11.png



# ## 不同类型卡的持卡人在办卡前一年内的平均收入对比

# In[292]:


# Keep only the rows whose transaction type (type_y) is '贷'
# (presumably "credit", i.e. money coming in -- treated as income here).
all_trans_pre_year_income = all_trans_pre_year[all_trans_pre_year['type_y']=='贷']
all_trans_pre_year_income.head()


# In[295]:


# Mean incoming amount per account. This aggregation was commented out, so
# all_trans_pre_year_income_mean was undefined when the merge below ran.
all_trans_pre_year_income_mean = (
    all_trans_pre_year_income.groupby('account_id')['amount'].mean().reset_index()
)

# Attach the card type (type_x). The right-hand side has one row per
# transaction, so duplicates are removed after the join.
all_trans_pre_year_income_mean = pd.merge(all_trans_pre_year_income_mean,all_trans_pre_year[['account_id','type_x']],how='left',on='account_id')
all_trans_pre_year_income_mean.drop_duplicates(inplace=True)
all_trans_pre_year_income_mean.head()


# In[296]:


# Distribution of the per-account mean incoming amount for each card type (type_x).
sns.boxplot(x='type_x',y='amount',data=all_trans_pre_year_income_mean)

12.png

推荐 0
本文由 顺子的 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册