研究二手房价的影响因素,建立房价影响模型
# -*- coding: utf-8 -*-
"""
@author: Rachel
"""
# In[]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats #scipy.stats包括统计工具和随机过程的概率过程。
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt
# In[]:
# Switch to the homework folder so the CSV can be opened by its bare file name.
os.chdir(r"E:\Python_learning\data_science\task_0529\HW6")
house = pd.read_csv(filepath_or_buffer='sndHsPr.csv')
# Preview the first five rows (only displayed when run as an interactive cell).
house.head(n=5)
# In[]:
# Lookup table from the district codes stored in the `dist` column to the
# Chinese district names written to the new `dist1` column.
dist = {
    'fengtai': '丰台区',
    'haidian': '海淀区',
    'chaoyang': '朝阳区',
    'dongcheng': '东城区',
    'xicheng': '西城区',
    'shijingshan': '石景山区',
}
house['dist1'] = house['dist'].map(dist)
# Preview the first five rows to check the new column.
house.head(n=5)
# In[]:
# (1) Response variable: summary statistics of the price per unit area.
house.price.describe()
# In[]:
# Histogram with a KDE curve and a fitted normal density, then a Q-Q plot
# drawn against the 45-degree reference line.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; it is
# kept here because its `fit=` overlay has no drop-in replacement - confirm
# the installed seaborn version still provides it.
sns.distplot(house['price'], fit=stats.norm, kde=True)
fig1 = sm.qqplot(house['price'], line='45', fit=True)
fig1.show()
# In[]:
# Box plot of the price column.
house['price'].plot.box()
#(二) 自变量分析:
# In[]:
# Import matplotlib directly instead of going through the pylab namespace,
# which the matplotlib documentation discourages.
import matplotlib as mpl

# Set the default sans-serif font to SimHei (needed for the Chinese labels).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Avoid the minus sign '-' being rendered as a box when figures are saved.
mpl.rcParams['axes.unicode_minus'] = False
# In[]:
# 2.1 Distribution of each explanatory variable on its own
data1 = ['dist1', 'roomnum', 'halls', 'floor', 'subway', 'school']
data2 = ['AREA']
# Columns in data1: print the frequency table and draw it as a bar chart.
for i in data1:
    print('name:', i)
    # Count once and reuse the result for both the printout and the plot.
    counts = house[i].value_counts()
    print(counts)
    counts.plot(kind='bar')
    plt.show()
# Columns in data2: print summary statistics, then draw a histogram with a
# fitted normal density and a Q-Q plot against the 45-degree line.
for i in data2:
    print('name:', i)
    print(house[i].describe())
    sns.distplot(house[i], kde=True, fit=stats.norm)
    fig = sm.qqplot(house[i], fit=True, line='45')
    fig.show()
# In[]:
# 2.2 Effect of each explanatory variable on the response
data3 = ['dist1', 'roomnum', 'halls', 'floor']
data4 = ['subway', 'school']
data5 = ['AREA']
# Two-sample t-tests: compare price between the rows where the column is 0
# and the rows where it is 1.
for i in data4:
    # Label each result with its column, like the other analysis loops do.
    print('name:', i)
    Suc0 = house[house[i] == 0]['price']
    Suc1 = house[house[i] == 1]['price']
    # Levene's test (median-centred) for equality of the two group variances.
    leveneTestRes = stats.levene(Suc0, Suc1, center='median')
    print('w-value=%6.4f, p-value=%6.4f'
          % (leveneTestRes.statistic, leveneTestRes.pvalue))
    # equal_var=False selects Welch's t-test, which does not assume equal
    # variances. Call it from the public `scipy.stats` namespace; the nested
    # `scipy.stats.stats` module is deprecated.
    print(stats.ttest_ind(Suc0, Suc1, equal_var=False), '\n')
# In[]:
# One-way ANOVA of price against each column in data3, treated as categorical.
for i in data3:
    print('name:', i)
    # Put the column name itself into the formula so the term is resolved from
    # `data=house` and the ANOVA row is labelled C(<column>), instead of
    # relying on the ambient `house` and `i` variables inside the formula.
    print(sm.stats.anova_lm(
        ols('price ~ C({})'.format(i), data=house).fit()), '\n')
# In[]:
# Correlation analysis: scatter plot of each column in data5 against price,
# followed by their Pearson correlation matrix.
for i in data5:
    print('name:', i)
    house.plot(x=i, y='price', kind='scatter')
    plt.show()
    print(house[[i, 'price']].corr(method='pearson'), "\n")
# (三)建立房价预测模型
# In[]:
#3.1 线性回归模型
def forward_select(data, response):
    """Fit an OLS model whose predictors are chosen by forward selection on AIC.

    Starting with no predictors, each round fits one model per remaining
    column (added to the columns selected so far) and picks the column whose
    model has the lowest AIC. The column is kept only if that AIC is lower
    than the best AIC of the previous round; otherwise selection stops.
    Progress and the final formula are printed.

    Parameters
    ----------
    data : object with a ``columns`` attribute, accepted by ``ols(data=...)``
        Holds the response column and every candidate predictor column.
    response : str
        Name of the response column; it must be one of ``data.columns``
        (``KeyError`` is raised by ``set.remove`` otherwise).

    Returns
    -------
    The fitted result of ``ols(formula, data).fit()`` for the final formula.
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = float('inf')
    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {}".format(
                response, ' + '.join(selected + [candidate]))
            aic = ols(formula=formula, data=data).fit().aic
            aic_with_candidates.append((aic, candidate))
        # Lowest AIC wins; ties fall back to the smaller column name, which is
        # the same element a descending sort followed by pop() would return.
        best_new_score, best_candidate = min(aic_with_candidates)
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {},continuing!'.format(current_score))
        else:
            print('forward selection over!')
            break
    # With nothing selected (e.g. no candidate columns at all) fall back to an
    # explicit intercept-only model instead of an empty right-hand side.
    rhs = ' + '.join(selected) if selected else '1'
    formula = "{} ~ {} ".format(response, rhs)
    print('final formula is {}'.format(formula))
    model = ols(formula=formula, data=data).fit()
    return model
# %%
# Disabled alternative, kept for reference: ANOVA with every variable wrapped in C().
#sm.stats.anova_lm(ols('price ~ C(dist1)+C(roomnum)+C(halls)+C(floor)+C(subway)+C(school)+C(AREA)',data=house).fit())
# Keep the response plus the seven candidate predictors, then let forward
# selection choose the predictors of the plain linear model.
data_for_select = house.loc[:, [
    'price',
    'dist1',
    'roomnum',
    'halls',
    'floor',
    'subway',
    'school',
    'AREA',
]]
lm_m = forward_select(response='price', data=data_for_select)
print(lm_m.rsquared)
# %%
# 3.2 Linear model with a log-transformed response
data_for_select_lg = data_for_select.copy()
# Take the natural log of the whole column in one vectorised call rather than
# calling np.log once per row through apply().
data_for_select_lg['price_lg'] = np.log(data_for_select_lg['price'])
# Drop the raw price so forward selection cannot pick it as a predictor.
data_for_select_lg = data_for_select_lg.drop(columns=['price'])
lm_m_lg = forward_select(data=data_for_select_lg, response='price_lg')
print(lm_m_lg.rsquared)
# In[]:
# 3.3 Log-linear model with AREA replaced by its natural log.
# NOTE(review): the original heading says this step considers interaction
# terms, but no interaction term is added to the candidates here - confirm
# the intended specification.
# Work on the column as a Series so a Series (not a one-column DataFrame) is
# assigned to AREA_lg.
data_for_select_lg['AREA_lg'] = np.log(data_for_select_lg['AREA'])
# Drop the raw AREA so only its log is offered to forward selection.
data_for_select_lg = data_for_select_lg.drop(columns=['AREA'])
# This rebinds lm_m_lg, replacing the model fitted in section 3.2.
lm_m_lg = forward_select(data=data_for_select_lg, response='price_lg')
print(lm_m_lg.rsquared)
#%%
# (4) Prediction: suppose a family of three wants their child to attend school
# in Dongcheng district and is looking for a two-bedroom flat near the subway,
# 70 square metres, on a middle floor - roughly what would the price be?
lst = [dict(
    dist1="东城区",
    roomnum=2,
    halls=0,
    floor="middle",
    subway=1,
    school=1,
    AREA_lg=np.log(70),
    price_lg=0,
)]
df = pd.DataFrame(data=lst)
# lm_m_lg was fitted with price_lg (the natural log of price) as the response,
# so the value returned here is a log price; np.exp() converts it back.
lm_m_lg.predict(df)
一、因变量分析:单位面积房价分析
count 16210.000000
mean 61151.810919
std 22293.358147
min 18348.000000
25% 42812.250000
50% 57473.000000
75% 76099.750000
max 149871.000000
Name: price, dtype: float64
二、自变量分析
2.1 自变量自身分布分析
name: dist1
丰台区 2947
海淀区 2919
朝阳区 2864
东城区 2783
西城区 2750
石景山区 1947
Name: dist1, dtype: int64
name: roomnum
2 7971
3 4250
1 3212
4 675
5 102
Name: roomnum, dtype: int64
name: halls
1 11082
2 4231
0 812
3 85
Name: halls, dtype: int64
name: floor
middle 5580
high 5552
low 5078
Name: floor, dtype: int64
name: subway
1 13419
0 2791
Name: subway, dtype: int64
name: school
0 11297
1 4913
Name: school, dtype: int64
name: AREA
count 16210.000000
mean 91.746598
std 44.000768
min 30.060000
25% 60.000000
50% 78.830000
75% 110.517500
max 299.000000
Name: AREA, dtype: float64
2.2 自变量对因变量影响分析
name: subway
w-value=328.1249, p-value=0.0000
Ttest_indResult(statistic=-38.41845699514431, pvalue=3.207202424810418e-281)
name: school
w-value=58.3647, p-value=0.0000
Ttest_indResult(statistic=-77.29591891711223, pvalue=0.0)
name: dist1
df sum_sq mean_sq F PR(>F)
C(house[i]) 5.0 4.215655e+12 8.431310e+11 3557.727873 0.0
Residual 16204.0 3.840118e+12 2.369858e+08 NaN NaN
name: roomnum
df sum_sq mean_sq F PR(>F)
C(house[i]) 4.0 6.703058e+09 1.675764e+09 3.373776 0.009113
Residual 16205.0 8.049070e+12 4.967029e+08 NaN NaN
name: halls
df sum_sq mean_sq F PR(>F)
C(house[i]) 3.0 2.917226e+10 9.724088e+09 19.63329 1.063322e-12
Residual 16206.0 8.026601e+12 4.952857e+08 NaN NaN
name: floor
df sum_sq mean_sq F PR(>F)
C(house[i]) 2.0 1.999790e+10 9.998948e+09 20.166437 1.789401e-09
Residual 16207.0 8.035775e+12 4.958212e+08 NaN NaN
name: AREA
AREA price
AREA 1.000000 -0.073955
price -0.073955 1.000000
三、建立房价模型
3.1 线性回归模型
aic is 358593.69581283897,continuing!
aic is 356727.2565217991,continuing!
aic is 356251.24413644033,continuing!
aic is 356169.91982609546,continuing!
aic is 356005.8440516004,continuing!
aic is 355957.1383683952,continuing!
aic is 355929.32364125375,continuing!
final formula is price ~ dist1 + school + subway + halls + AREA + floor + roomnum
rsquared: 0.5959091332283823
3.2 对因变量取对数的线性模型
aic is 407.21691478716093,continuing!
aic is -1169.6133484040984,continuing!
aic is -1838.5011115886227,continuing!
aic is -1972.8738713787243,continuing!
aic is -2156.0212286889,continuing!
aic is -2214.6759667986043,continuing!
aic is -2230.826095094548,continuing!
final formula is price_lg ~ dist1 + school + subway + halls + AREA + floor + roomnum
rsquared: 0.6154245082337468
3.3 考虑交互项的对数线性
aic is 407.21691478716093,continuing!
aic is -1169.6133484040984,continuing!
aic is -1838.5011115886227,continuing!
aic is -1972.8738713787243,continuing!
aic is -2247.2296682713277,continuing!
aic is -2308.0787091857637,continuing!
aic is -2358.420282359053,continuing!
final formula is price_lg ~ dist1 + school + subway + halls + AREA_lg + floor + roomnum
rsquared: 0.6184397446086782
四、预测:假设有一家三口,父母为了能让孩子在东城区上学,想买一套邻近地铁的两居室,面积是70平方米,中层楼层,预测房价。
0 11.169971
dtype: float64
本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。