HW6

浏览: 1020

研究二手房价的影响因素,建立房价影响模型

# -*- coding: utf-8 -*-
"""
@author: Rachel
"""
# In[]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats #scipy.stats包括统计工具和随机过程的概率过程。
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt

# In[]:
# Switch to the author's local working directory so the relative CSV path
# below resolves, then load the data set.
os.chdir(r"E:\Python_learning\data_science\task_0529\HW6")
house = pd.read_csv('sndHsPr.csv')
house.head()

# In[]:
# Translate the district codes stored in column `dist` into their Chinese
# names and keep the result in a new column `dist1`.
dist = {
    'fengtai': '丰台区',
    'haidian': '海淀区',
    'chaoyang': '朝阳区',
    'dongcheng': '东城区',
    'xicheng': '西城区',
    'shijingshan': '石景山区',
}
house['dist1'] = house['dist'].map(dist)
house.head()
# In[]:
# (1) Price per unit area: summary statistics.
house.price.describe()

# In[]:
# Histogram with a KDE and a fitted normal curve, followed by a Q-Q plot of
# the standardised prices against the 45-degree line.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# confirm the installed version still provides it.
sns.distplot(house['price'], fit=stats.norm, kde=True)
fig1 = sm.qqplot(house['price'], line='45', fit=True)
fig1.show()

# In[]:
# Box plot of the same column.
house['price'].plot.box()

#(二) 自变量分析:
# In[]:
from pylab import mpl

# Set the default sans-serif font to SimHei and disable the Unicode minus
# sign, which otherwise shows up as a box in saved figures.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# In[]:
# 2.1 Distribution of each explanatory variable on its own
data1 = ['dist1', 'roomnum', 'halls', 'floor', 'subway', 'school']
data2 = ['AREA']

# Columns in data1: print the frequency table and draw it as a bar chart.
for col in data1:
    counts = house[col].value_counts()
    print('name:', col)
    print(counts)
    counts.plot(kind='bar')
    plt.show()

# Columns in data2: print summary statistics, then draw a histogram with a
# fitted normal curve and a Q-Q plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# confirm the installed version still provides it.
for col in data2:
    print('name:', col)
    print(house[col].describe())
    sns.distplot(house[col], fit=stats.norm, kde=True)
    fig = sm.qqplot(house[col], line='45', fit=True)
    fig.show()

# In[]:
# 2.2 Effect of each explanatory variable on the response (price)
data3 = ['dist1', 'roomnum', 'halls', 'floor']
data4 = ['subway', 'school']
data5 = ['AREA']

# Two-sample t-test for the columns in data4: compare the prices of the rows
# where the column equals 0 against the rows where it equals 1.
for i in data4:
    # Label each result so the two variables can be told apart in the output.
    print('name:', i)
    Suc0 = house[house[i] == 0]['price']
    Suc1 = house[house[i] == 1]['price']
    # Levene's test (median-centred) for equality of the two variances.
    leveneTestRes = stats.levene(Suc0, Suc1, center='median')
    print('w-value=%6.4f, p-value=%6.4f'
          % (leveneTestRes.statistic, leveneTestRes.pvalue))
    # Welch's t-test (equal_var=False) does not assume equal variances.
    # Call the public scipy.stats.ttest_ind rather than going through the
    # deprecated scipy.stats.stats namespace.
    print(stats.ttest_ind(Suc0, Suc1, equal_var=False), '\n')
# In[]:
# One-way ANOVA of price on each column in data3, treated as categorical.
for i in data3:
    print('name:', i)
    # Build the formula from the column name so the term is looked up in
    # `data=house` and the ANOVA row is labelled C(<column>) instead of the
    # same opaque "C(house[i])" for every variable.
    # NOTE(review): assumes the column names are valid formula identifiers,
    # which holds for the four names listed in data3.
    print(sm.stats.anova_lm(
        ols('price ~ C({})'.format(i), data=house).fit()), '\n')
# In[]:
# Correlation analysis: scatter plot and Pearson correlation matrix of each
# column in data5 against price.
for col in data5:
    print('name:', col)
    house.plot.scatter(x=col, y='price')
    plt.show()
    pair = house[[col, 'price']]
    print(pair.corr(method='pearson'), "\n")

# (三)建立房价预测模型
# In[]:
#3.1 线性回归模型
def forward_select(data, response):
    """Forward stepwise selection of OLS predictors using AIC.

    Starts with no predictors. In every round, each remaining column is
    tentatively added to the already selected ones, an OLS model is fitted,
    and the column giving the lowest AIC is kept -- but only if that AIC is
    lower than the best AIC so far. Selection stops when no candidate
    improves the AIC or when no columns remain.

    Parameters
    ----------
    data : object with a ``columns`` attribute, passed to ``ols(data=...)``
        Must contain ``response``; every other column is a candidate.
        Column names are interpolated into the formula string unquoted, so
        they must be usable as formula terms.
    response : str
        Name of the dependent-variable column.

    Returns
    -------
    The fitted result of ``ols(formula, data).fit()`` for the final formula.

    Raises
    ------
    KeyError
        If ``response`` is not among ``data.columns``.
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = float('inf')
    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {}".format(
                response, ' + '.join(selected + [candidate]))
            aic = ols(formula=formula, data=data).fit().aic
            aic_with_candidates.append((aic, candidate))
        # Lowest AIC wins; ties fall back to the candidate name, which keeps
        # the choice independent of set iteration order.
        best_new_score, best_candidate = min(aic_with_candidates)
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {},continuing!'.format(current_score))
        else:
            print('forward selection over!')
            break

    # If nothing was selected (e.g. `data` holds only the response column),
    # fall back to an intercept-only model instead of an empty right-hand
    # side, which is not a valid formula.
    formula = "{} ~ {} ".format(response, ' + '.join(selected) or '1')
    print('final formula is {}'.format(formula))
    model = ols(formula=formula, data=data).fit()
    return model
# %%
#sm.stats.anova_lm(ols('price ~ C(dist1)+C(roomnum)+C(halls)+C(floor)+C(subway)+C(school)+C(AREA)',data=house).fit())
# Keep only the response and the seven candidate predictors, run forward
# selection on the raw price, and report the R-squared of the chosen model.
data_for_select = house.loc[:, [
    'price',
    'dist1', 'roomnum', 'halls', 'floor',
    'subway', 'school', 'AREA',
]]
lm_m = forward_select(response='price', data=data_for_select)
print(lm_m.rsquared)

# %%
# 3.2 Linear model with the response on a log scale: replace `price` by its
# natural log (`price_lg`) and rerun forward selection.
data_for_select_lg = data_for_select.drop(columns=['price'])
data_for_select_lg['price_lg'] = np.log(data_for_select['price'])
lm_m_lg = forward_select(response='price_lg', data=data_for_select_lg)
print(lm_m_lg.rsquared)

# In[]:
# 3.3 Log-log variant: additionally replace AREA by its natural log.
# No interaction terms are added here; the only change from 3.2 is the
# AREA -> AREA_lg transformation.
# Take the log of the AREA Series directly: assigning the one-column
# DataFrame produced by `df[['AREA']].apply(...)` to a single column relies
# on pandas coercing it, whereas a Series assignment is unambiguous.
data_for_select_lg['AREA_lg'] = np.log(data_for_select_lg['AREA'])
data_for_select_lg.drop(['AREA'], axis=1, inplace=True)
# Rebinds lm_m_lg, so from here on it is the model that uses AREA_lg.
lm_m_lg = forward_select(data=data_for_select_lg, response='price_lg')
print(lm_m_lg.rsquared)

#%%
# (4) Prediction. Scenario: a family of three wants a two-bedroom flat in
# Dongcheng district for the child's schooling, near the subway, 70 square
# metres, on a middle floor -- roughly what would the price be?

# One-row frame holding the scenario. AREA is supplied as AREA_lg because
# lm_m_lg was last fitted on the log-transformed area; price_lg is only a
# placeholder value.
lst = [{"dist1": "东城区", "roomnum": 2, "halls": 0, "floor": "middle",
        "subway": 1, "school": 1, "AREA_lg": np.log(70), "price_lg": 0}]
df = pd.DataFrame(lst)

# The model's response is log(price), so the raw prediction is on the log
# scale; exponentiate it to answer the question in price units.
# NOTE(review): a plain exp() back-transform ignores retransformation bias --
# treat the result as an approximate figure.
price_lg_pred = lm_m_lg.predict(df)
print(price_lg_pred)
print(np.exp(price_lg_pred))

一、因变量分析:单位面积房价分析

count     16210.000000
mean 61151.810919
std 22293.358147
min 18348.000000
25% 42812.250000
50% 57473.000000
75% 76099.750000
max 149871.000000
Name: price, dtype: float64

image.png

image.png

image.png


二、自变量分析

2.1 自变量自身分布分析

name: dist1
丰台区 2947
海淀区 2919
朝阳区 2864
东城区 2783
西城区 2750
石景山区 1947
Name: dist1, dtype: int64

image.png

name: roomnum
2 7971
3 4250
1 3212
4 675
5 102
Name: roomnum, dtype: int64

image.png

name: halls
1 11082
2 4231
0 812
3 85
Name: halls, dtype: int64

image.png

name: floor
middle 5580
high 5552
low 5078
Name: floor, dtype: int64

image.png

name: subway
1 13419
0 2791
Name: subway, dtype: int64

image.png

name: school
0 11297
1 4913
Name: school, dtype: int64

image.png

name: AREA
count 16210.000000
mean 91.746598
std 44.000768
min 30.060000
25% 60.000000
50% 78.830000
75% 110.517500
max 299.000000
Name: AREA, dtype: float64

image.png

image.png

2.2 自变量对因变量影响分析

name: subway
w-value=328.1249, p-value=0.0000
Ttest_indResult(statistic=-38.41845699514431, pvalue=3.207202424810418e-281)

name: school
w-value=58.3647, p-value=0.0000
Ttest_indResult(statistic=-77.29591891711223, pvalue=0.0)
name: dist1
df sum_sq mean_sq F PR(>F)
C(house[i]) 5.0 4.215655e+12 8.431310e+11 3557.727873 0.0
Residual 16204.0 3.840118e+12 2.369858e+08 NaN NaN

name: roomnum
df sum_sq mean_sq F PR(>F)
C(house[i]) 4.0 6.703058e+09 1.675764e+09 3.373776 0.009113
Residual 16205.0 8.049070e+12 4.967029e+08 NaN NaN

name: halls
df sum_sq mean_sq F PR(>F)
C(house[i]) 3.0 2.917226e+10 9.724088e+09 19.63329 1.063322e-12
Residual 16206.0 8.026601e+12 4.952857e+08 NaN NaN

name: floor
df sum_sq mean_sq F PR(>F)
C(house[i]) 2.0 1.999790e+10 9.998948e+09 20.166437 1.789401e-09
Residual 16207.0 8.035775e+12 4.958212e+08 NaN NaN
name: AREA
AREA price
AREA 1.000000 -0.073955
price -0.073955 1.000000

image.png


三、建立房价模型

3.1 线性回归模型

aic is 358593.69581283897,continuing!
aic is 356727.2565217991,continuing!
aic is 356251.24413644033,continuing!
aic is 356169.91982609546,continuing!
aic is 356005.8440516004,continuing!
aic is 355957.1383683952,continuing!
aic is 355929.32364125375,continuing!
final formula is price ~ dist1 + school + subway + halls + AREA + floor + roomnum
rsquared: 0.5959091332283823

3.2 对因变量取对数的线性模型

aic is 407.21691478716093,continuing!
aic is -1169.6133484040984,continuing!
aic is -1838.5011115886227,continuing!
aic is -1972.8738713787243,continuing!
aic is -2156.0212286889,continuing!
aic is -2214.6759667986043,continuing!
aic is -2230.826095094548,continuing!
final formula is price_lg ~ dist1 + school + subway + halls + AREA + floor + roomnum
rsquared: 0.6154245082337468

3.3 对面积(AREA)也取对数的对数线性模型(原标题为"考虑交互项的对数线性",但模型中并未加入交互项)

aic is 407.21691478716093,continuing!
aic is -1169.6133484040984,continuing!
aic is -1838.5011115886227,continuing!
aic is -1972.8738713787243,continuing!
aic is -2247.2296682713277,continuing!
aic is -2308.0787091857637,continuing!
aic is -2358.420282359053,continuing!
final formula is price_lg ~ dist1 + school + subway + halls + AREA_lg + floor + roomnum
rsquared: 0.6184397446086782

四、预测:假设有一家三口,父母为了能让孩子在东城区上学,想买一套邻近地铁的两居室,面积是70平方米,中层楼层,预测房价。(注:模型的因变量是 ln(price),下面的输出是对数值;取指数还原后 e^11.169971 ≈ 70967,即单位面积房价约 7.1 万。)

0    11.169971
dtype: float64
推荐 0
本文由 这一秒不放弃 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册