碧沼
V1
2022/10/21阅读:68主题:默认主题
python时间序列
本文使用的第三方库如下:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn import metrics
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import pymysql
import warnings
warnings.filterwarnings('ignore')
数据获取的方式通过调用本地mysql数据得到600519贵州茅台历史价格数据。
cursor = pymysql.connect(host='127.0.0.1',user='root',password='root',db='tysql').cursor()
sql = "select datetime,open,close,high,low,Volume from day_k_data WHERE stockcode='600519.XSHG'"
cursor.execute(sql)
data = pd.DataFrame(cursor.fetchall(),columns=['datetime','open','close','high','low','Volume'])
data.head(10)


时间序列的平稳性检验与模型评价
模型评价的函数如下:
def timeseries_evaluation_fun(y_true,y_pred):
def mean_absolute_percentage_error(y_true,y_pred):
y_true,y_pred = np.array(y_true),np.array(y_pred)
return np.mean(np.abs((y_true-y_pred)/y_true))*100
print('Evakuation metric results:')
print(f'MSE is :{metrics.mean_squared_error(y_true,y_pred)}')
print(f'MAE is :{metrics.mean_absolute_error(y_true,y_pred)}')
print(f'RMSE is :{np.sqrt(metrics.mean_squared_error(y_true,y_pred))}')
print(f'MAPE is :{mean_absolute_percentage_error(y_true,y_pred)}')
print(f"R2 is :{metrics.r2_score(y_true,y_pred)}",end='\n\n')
单位根ADF检验函数如下:
def adf_test_func(series,column_name):
print(f'Results of DF test for column: {column_name}')
dftest = adfuller(series,autolag='AIC')
dfoutput = pd.Series(dftest[0:4],index=['Test Statistic','p-value','No Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Values(%s)'%key] = value
print(dfoutput)
if dftest[1]<=0.05:
print('Conclusion: ----->')
print('Reject the null hypothesis')
print('Data is stationary')
else:
print('Conclusion: ----->')
print('Fail to reject the null hypothesis')
print('Data is non-stationary')

从结果可以看出600519收盘价不是平稳序列。
ARIMA


ARIMA是最经典的时间序列模型形式,以上两个表展现了不同参数下的模型数学形式。
ARIMA在python的实现
X = data.close
train,test = X[0:-30],X[-30:]
stepwise_model = auto_arima(train,start_p=1, start_q=1,
max_p=7, max_q=7, seasonal=False,
d=None, trace=True,error_action='ignore',suppress_warnings=True,
stepwise=True)
stepwise_model.summary()

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15,7]
plt.plot(train,label='trian')
plt.plot(test,label='test')
plt.plot(forecast,label='Predicted')
plt.plot(df_conf['Upper_bound'] , label ='Confidence Interval Upper bound')
plt.plot(df_conf['Lower_bound'] , label = 'Confidence Interval Lower bound')
plt.legend(loc='best')
plt.show()
stepwise_model.plot_diagnostics()

ARIMA的预测很难跟踪金融时间序列尤其是低阶的ARIMA模型,从图中可以发现模型的预测误差较大。
SARIMA
季节ARIMA是ARIMA的拓展形式,为季节分量创建了AR、I、MA三个超参数,表现出季节性。简单理解就是在ARIMA时间序列模型的基础上引入了季节项,
例子:
-
非季节成分MA(1) :
-
季节成分MA(1):
SARIMA在Python的实现
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pmdarima import auto_arima
from sklearn import metrics
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
for m in [1,4,7,12,52]:
print("="*100)
print(f'Fitting SARIMA for Seasonal value m = {str(m)}')
stepwise_model = auto_arima(train, start_p=1, start_q=1,
max_p=7, max_q=7, seasonal=True, start_P=1,
start_Q=1, max_P=7, max_D=7, max_Q=7, m=m,
d=None, D=None, trace=True, error_action='ignore', suppress_warnings=True,
stepwise=True)
print(f'Model summary for m = {str(m)}')
print("-"*100)
stepwise_model.summary()
forecast,conf_int = stepwise_model.predict(n_periods=30,return_conf_int=True)
df_conf = pd.DataFrame(conf_int,columns=['Upper_bound','Lower_bound'])
df_conf['new_index'] = range(1675,1705)
df_conf = df_conf.set_index('new_index')
forecast = pd.DataFrame(forecast,columns=['close_pred'])
forecast['new_index'] = range(1675,1705)
forecast = forecast.set_index('new_index')
timeseries_evaluation_fun(test,forecast)
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize']=[15,7]
plt.plot(train,label='train')
plt.plot(test,label='test')
plt.plot(forecast,label = f'Predicted with m={str(m)}')
plt.plot(df_conf['Upper_bound'],label='Confidence Interval Upper')
plt.plot(df_conf['Lower_bound'],label='Confidence Interval Lower')
plt.legend(loc='best')
print("-"*100)
stepwise_model.plot_diagnostics()
print("-"*100)

SARIMA与ARIMA在股价序列的预测效果近似。存在较大的误差、
SARIMAX
SARIMAX是一个具有外部影响变量的SARIMA模型,X为外生变量的向量。可以表示为
同样可以理解为是在SARIMA模型的基础上引入了外生冲击,例如要预测股票收盘价,外生变量可以是股票的开盘价。
SARIMAX在Python的实现
for name,column in data[['close','open','high','low']].iteritems():
adf_test_func(data[name],name)
X = data.close
actualtrain,actualtest = X[0:-30],X[-30:]
exoX = data.open
exotrain,exotest = exoX[0:-30],exoX[-30:]
for m in [1,4,7,12,52]:
print("="*100)
print(f'Fitting SARIMAX for Seasonal value m = {str(m)}')
stepwise_model = auto_arima(actualtrain,X = pd.DataFrame(exotrain), start_p=1, start_q=1,
max_p=7, max_q=7, seasonal=True, start_P=1,
start_Q=1, max_P=7, max_D=7, max_Q=7, m=m,
d=None, D=None, trace=True, error_action='ignore', suppress_warnings=True,
stepwise=True)
print(f'Model summary for m = {str(m)}')
print("-"*100)
stepwise_model.summary()
forecast,conf_int = stepwise_model.predict(n_periods=30,X = pd.DataFrame(exotest),return_conf_int=True)
df_conf = pd.DataFrame(conf_int,columns=['Upper_bound','Lower_bound'])
df_conf['new_index'] = range(1675,1705)
df_conf = df_conf.set_index('new_index')
forecast = pd.DataFrame(forecast,columns=['close_pred'])
forecast['new_index'] = range(1675,1705)
forecast = forecast.set_index('new_index')
timeseries_evaluation_fun(actualtest,forecast)
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize']=[15,7]
plt.figure()
plt.plot(actualtrain,label='train')
plt.plot(actualtest,label='test')
plt.plot(forecast,label = f'Predicted with m={str(m)}')
plt.plot(df_conf['Upper_bound'],label='Confidence Interval Upper')
plt.plot(df_conf['Lower_bound'],label='Confidence Interval Lower')
plt.legend(loc='best')
print("-"*100)
stepwise_model.plot_diagnostics()
print("-"*100)

可见引入当日开盘价作为外生变量的SARIMAX模型可以跟得上价格序列的演变了,拟合程度相比于ARIMA、SARIMA有较大的提高。
作者介绍
碧沼
V1