python基础5 ARIMA

浏览: 1856
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf,pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.tsa.stattools as st
from statsmodels.tsa.arima_model import ARMA
import pandas as pd
import numpy as np
dir_path='E://Data//'
order_data = pd.read_excel(dir_path+'order_data1.xlsx',header=0,
names=['order_date','product_id','product_name','qty'],
dtype={'order_date':str,'product_id':str,'product_name':str,'qty':str},
encoding='utf-8')
order_data['order_date']=pd.to_datetime(order_data.order_date)
order_data.qty=order_data.qty.map(int)
data=order_data[['order_date','product_id','qty']].groupby(['order_date','product_id']).sum().reset_index()
data1=order_data[['order_date','qty']].groupby(['order_date']).sum()
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6 #rcParams设定好画布的大小
#平稳性测试
def test_stationarity(timeseries):
# 决定起伏统计,以一个月为一个窗口,每一个时间t的值由它前面2个月(包括自己)的均值代替
rolmean = pd.rolling_mean(timeseries, window=12) # 对size个数据进行移动平均
rol_weighted_mean = pd.ewma(timeseries, span=12) # 对size个数据进行加权移动平均
rolstd = pd.rolling_std(timeseries, window=12) # 偏离原始值多少

# plot rolling statistics
orig = plt.plot(timeseries, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
weighted_mean = plt.plot(rol_weighted_mean, color='green', label='weighted Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')

plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)

# 进行df测试 Dickey-Fuller test
print('Result of Dickry-Fuller test')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of observations Used'])
# dftest的输出前一项依次为检测值,p值,滞后数,使用的观测数,各个置信度下的临界值
for key, value in dftest[4].items():
dfoutput['Critical value(%s)' % key] = value
print(dfoutput)

ts = data1['qty']
#test_stationarity(ts)
#plt.show()
# 差分differencing
ts_diff = ts.diff(1)
ts_diff.dropna(inplace=True)
#test_stationarity(ts_diff)
#plt.show()

#二阶差分和一阶差分对比图
ts_diff1 = ts.diff(1)
ts_diff2 = ts.diff(2)
#ts_diff1.plot(color='red')
#ts_diff2.plot()
#plt.show()

#长期趋势和周期性的变动。
# estimating
#取对数
ts_log = np.log(ts)
# plt.plot(ts_log)
# plt.show()
moving_avg = pd.rolling_mean(ts_log, 12)
# plt.plot(moving_avg)
# plt.plot(moving_avg,color='red')
# plt.show()
ts_log_moving_avg_diff = ts_log - moving_avg
# print ts_log_moving_avg_diff.head(12)
ts_log_moving_avg_diff.dropna(inplace=True)
#test_stationarity(ts_log_moving_avg_diff)
#plt.show()

#作差 log后差分
ts_log_diff = ts_log.diff(1)
ts_log_diff.dropna(inplace=True)
#test_stationarity(ts_log_diff)
#plt.show()

#Differencing--差分
ts_log_diff = ts_log - ts_log.shift()
ts_log_diff.dropna(inplace=True)
#test_stationarity(ts_log_diff)

#log后二阶差分与一阶差分对比图
ts_log_diff1 = ts_log.diff(1)
ts_log_diff2 = ts_log.diff(2)
#ts_log_diff1.plot()
#ts_log_diff2.plot()
#plt.show()
#log后的数据更平稳一些,但一阶差分和二阶差分几乎没有差别,选择一阶差分

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
#自相关图 --原数据
from statsmodels.graphics.tsaplots import plot_pacf,plot_acf
plot_acf(ts)
plt.title('自相关图 -- 原数据')
plt.show()

#自相关图 --log后差分数据
from statsmodels.graphics.tsaplots import plot_pacf,plot_acf
plot_acf(ts_log_moving_avg_diff)
plt.title('自相关图 -- 差分数据')
plt.show()

#从自相关图来看log后的一阶差分更稳定
#选择pq 方式一
lag_acf = acf(ts_diff, nlags=10)
lag_pacf = pacf(ts_diff, nlags=10, method='ols')
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(ts_diff)), linestyle='--', color='gray') # lowwer置信区间
plt.axhline(y=1.96 / np.sqrt(len(ts_diff)), linestyle='--', color='gray') # upper置信区间
plt.title('Autocorrelation Function')

plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(ts_diff)), linestyle='--', color='gray')
plt.axhline(y=1.96 / np.sqrt(len(ts_diff)), linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()

#选择pq 方式二
import statsmodels.tsa.stattools as st
order=st.arma_order_select_ic(ts_log_diff,max_ar=3,max_ma=3,ic=['aic','bic','hqic'])
order_min=order.bic_min_order
print(order_min)

# ARIMA
model = ARIMA(ts_log, order=(2, 1, 2))
result_ARIMA = model.fit(disp=-1)
plt.plot(ts_log_diff)
plt.plot(result_ARIMA.fittedvalues, color='red')
plt.title('ARIMA RSS:%.4f' % sum(result_ARIMA.fittedvalues - ts_log_moving_avg_diff) ** 2)
plt.show()

predictions_ARIMA_diff = pd.Series(result_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.head()#发现数据是没有第一行的,因为有1的延迟

predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()


predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()
#ts.head()

predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ts)
plt.plot(predictions_ARIMA,color='y')
plt.title('predictions_ARIMA RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - ts) ** 2) / len(ts)))
plt.show()

推荐 0
本文由 liliwu 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册