# -*- coding: utf-8 -*-
"""
Created on Sun Jun 10 20:34:56 2018
@author: mokki
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
# Load the second-hand housing data set.
# The working directory is switched to the coursework folder first so that the
# CSV can be opened by its bare file name.
# NOTE(review): the absolute path is machine-specific; adjust it before running
# this script anywhere else.
_DATA_DIR = r'C:\Users\lijin\Desktop\Ben数据分析课程\提交-第四讲:统计建模与分析报告-二手房价格分析报告\作业'
_CSV_NAME = 'sndHsPr.csv'
os.chdir(_DATA_DIR)
house_price = pd.read_csv(_CSV_NAME)
#%%
# Dependent variable analysis: price per unit area.
# Summary statistics are printed explicitly; a bare expression is silently
# discarded when the file is run as a script.
print(house_price.price.agg(['mean', 'median', 'std', 'skew']))
# Each chart gets its own figure; otherwise the boxplot is drawn on top of
# the histogram because both target the current axes.
plt.figure()
house_price.price.hist(bins=50)
plt.figure()
# The column must be passed as the keyword `x`: recent seaborn releases treat
# the first positional argument as `data`.
sns.boxplot(x='price', data=house_price)
#%%
# Distribution of each categorical explanatory variable.
# Every chart is drawn on a fresh figure: Series.plot reuses the current axes
# when one exists, which would stack all of these charts on top of each other.
plt.figure()
house_price.dist.value_counts().plot(kind='bar')
plt.figure()
house_price.dist.value_counts().plot(kind='pie')
# 'roomnum' is the column name used by the rest of the script (the original
# line referenced a misspelled 'roommum' attribute).
for _column in ['roomnum', 'halls', 'floor', 'subway', 'school']:
    plt.figure()
    house_price[_column].value_counts().plot(kind='bar')
#%%
# Distribution of the continuous explanatory variable AREA.
# Printed explicitly so the statistics are visible when run as a script.
print(house_price.AREA.agg(['mean', 'median', 'std', 'skew']))
# Separate figures keep the histogram and the boxplot from overlapping.
plt.figure()
house_price.AREA.hist(bins=50)
plt.figure()
# Keyword `x` is required: recent seaborn releases treat the first positional
# argument as `data`.
sns.boxplot(x='AREA', data=house_price)
#%%
# Effect of each explanatory variable on the dependent variable.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-way ANOVA of price against each categorical variable, with a boxplot of
# price per category. Each boxplot gets its own figure (seaborn draws on the
# current axes otherwise) and each ANOVA table is printed rather than left as
# a discarded bare expression.
for _factor in ['dist', 'roomnum', 'halls', 'floor', 'subway', 'school']:
    plt.figure()
    sns.boxplot(x=_factor, y='price', data=house_price)
    _anova_fit = ols('price~C({})'.format(_factor), data=house_price).fit()
    print(sm.stats.anova_lm(_anova_fit))

# Correlation analysis of AREA against price:
# 1. scatter plot, 2. log-transform of price, 3. Pearson correlation coefficient.
house_price.plot(x='AREA', y='price', kind='scatter')
house_price['price_ln'] = np.log(house_price['price'])
house_price.plot(x='AREA', y='price_ln', kind='scatter')
print(house_price[['price_ln', 'AREA']].corr(method='pearson'))
#%%
# Multiple linear regression.
# Integer codes for the district and floor columns (kept as extra columns).
district = {'fengtai': 1, 'haidian': 2, 'chaoyang': 3, 'dongcheng': 4, 'xicheng': 5, 'shijingshan': 6}
house_price['district'] = house_price.dist.map(district)
Floor = {'low': 1, 'middle': 2, 'high': 3}
house_price['Floor'] = house_price.floor.map(Floor)

# The district codes are arbitrary labels, not a scale, so the column is
# wrapped in C() to be dummy-encoded. Entering it as a plain number would
# force a single linear "per code step" effect that has no meaning.
# NOTE(review): Floor is still entered as a numeric 1/2/3 score, i.e. it is
# assumed to have a linear low < middle < high effect; confirm that is intended.
house_model = ols('price~C(district)+roomnum+halls+Floor+subway+school', data=house_price).fit()
print(house_model.summary())

# Same model with the log-transformed dependent variable.
house_model1 = ols('price_ln~C(district)+roomnum+halls+Floor+subway+school', data=house_price).fit()
print(house_model1.summary())

# Model with the non-significant explanatory variable removed (roomnum, as in
# the original analysis).
# NOTE(review): re-check roomnum's significance in the summary printed above.
house_model2 = ols('price_ln~C(district)+halls+Floor+subway+school', data=house_price).fit()
print(house_model2.summary())
#%%
# TODO: log-linear model with interaction terms (pending course walkthrough).
# TODO: prediction with the fitted model is not implemented yet (pending course walkthrough).