-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathforecast_GDP_midas.py
186 lines (152 loc) · 7.91 KB
/
forecast_GDP_midas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# ======================================= Compute OoS Performance Metrics for Forecasts produced by MIDAS ==================================================== #
# ============================================================================================================================================== #
import pandas as pd
from pandas.core.common import flatten
import numpy as np
import os
import sys
import math
from varname import nameof
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels import robust
from statsmodels.tsa.api import VAR
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import time
import psutil
import multiprocessing as multi
from itertools import product, repeat
from functools import partial
# from dask.distributed import Client, LocalCluster
# import joblib
path_midas = "e:\\Copy\\SCRIPTS\\Forecast_Covid_Recovery\\Code\\midas\\"
sys.path.append(path_midas)
from mix import mix_freq, mix_freq2
from adl import estimate, forecast, midas_adl, rmse, estimate2,forecast2, midas_adl2
os.environ['NUMEXPR_MAX_THREADS'] = '30'
##### Set the current working directory
path="e:\\Copy\\SCRIPTS\\Forecast_Covid_Recovery\\Code\\"
os.chdir(path)
##### parse dates and times
def date_parser(date):
dt = datetime.datetime.strptime(date, '%Y:%m:%d')
return dt.strftime('%Y-%m-%d')
##### Calculate rolling-window OoS R^2 for tau-step-ahead forecasts based on a univariate MIDAS model
def OoS_R_sq_midas(sstart:int, y_df: pd.DataFrame, x_df: pd.DataFrame, wsize: int, T1: int, xlag: str, ylag: int, tau: int, poly = 'expalmon', method='rolling') -> tuple:
"""
INPUT:
y_df and x_df: pandas dataframes with dates in the index column for both y and x;
'wsize': size of rolling windows; 'sstart': sub-sample starting point; 'T1': size of each sub-sample;
'xlag': number of lags for the high-frequency variable; 'ylag': number of autoregressive lags for the dependant low-frequency variable;
'tau': how much the high-frequency data is lagged before frequency mixing; 'poly': a MIDAS polynomial; 'method': a forecasting method ('rolling' or 'recursive')
OUTPUT:
a tuple of performance metrics:
"""
assert (T1 > wsize+tau), "size of a subsample must be much greater than the window size!"
assert ( sstart+T1 <= len(y_df) ), "end point of the last subsample must be less than or equal to size of the dataframe!"
y_df = y_df.iloc[sstart:(sstart+T1), :].copy()
x_df = x_df.copy()
start_date = y_df.index[3]
end_date = y_df.index[3+wsize]
# call 'midas_adl' function
rmse_in, rmse, fc = midas_adl(y_df.iloc[:, 0], x_df.iloc[:, 0], start_date=start_date, end_date=end_date, xlag=xlag, ylag=ylag, horizon=tau, poly=poly, method=method)
# compute rolling- window mean forecasts
n_month = y_df.index.get_loc(end_date) - y_df.index.get_loc(start_date)
y_df['benchmark'] = y_df.iloc[:, 0].rolling(window=n_month).mean()
# merge 'y_df' to 'fc'
fc = fc.merge(y_df, how = 'left', left_index=True, right_index=True)
# calculate mad, mae, rmse, r^2
fc.reset_index(inplace = True)
OoS_size = len(fc)
# print(fc)
rmse = 0
var = 0
mae = 0
list_err = []
for s in np.arange(OoS_size):
r_forecast = fc.loc[s, 'preds']
r = fc.loc[s, 'targets'] # actual returns
# print(fc.loc[s, 'index'], r, r_forecast)
rmse += pow(r - r_forecast, 2) / OoS_size
var += pow(r - fc.loc[s, 'benchmark'], 2) / OoS_size
mae += abs(r - r_forecast) / OoS_size
list_err.append(r - r_forecast)
err = np.array(list_err)
mad = robust.mad(err, c = 1)
del y_df, x_df
return sstart, rmse_in, float(mad), float(mae), math.sqrt(float(rmse) ), float(1 - rmse/(var + 0.00001) )
##### Define a wrapper to parallel compute values of the OoS R^2
def OoS_R_sq_midas_wrapper(y_df, x_df, x_df_name: str, wsize: int, T1: int, xlag: str, ylag: int, tau: int, poly = 'expalmon', method='rolling', processes=-1):
assert(tau > 0), "the forecast horizon must be greater than zero!"
# decide how many proccesses will be created
ssize = len(y_df) - ylag
if processes <=0:
num_cpus = psutil.cpu_count(logical=False)
else:
num_cpus = processes
start = time.time()
# create a list of all feasible rolling-window starting points
step = 1
sstarts, ssample_start_dates, ssample_end_dates = [], [], []
for i in np.arange(0, ssize+step, step):
if i+T1 <= ssize:
sstarts.append(i)
ssample_start_dates.append( y_df.index[i] )
ssample_end_dates.append( y_df.index[i+T1-1] )
# start processes in the pool
print( 'Start multiprocessing . . .' )
with multi.Pool(processes = num_cpus) as process_pool:
# perfs = process_pool.starmap( process_OoS_R_sq, zip(sstarts, repeat(use_model) ) )
perfs = process_pool.starmap( OoS_R_sq_midas, zip(sstarts, repeat(y_df), repeat(x_df), repeat(wsize), repeat(T1), repeat(xlag), \
repeat(ylag), repeat(tau), repeat(poly), repeat(method) ) )
process_pool.close()
# process_pool.terminate()
process_pool.join()
print( 'Multiprocessing done!' )
# put all numbers into a dataframe
data = pd.DataFrame( perfs, columns = ['sstart', 'rmse_in', 'mad', 'mae', 'rmse', 'r_sq'] )
data.insert(loc=1, column='ssample_start_date', value=ssample_start_dates)
data.insert(loc=2, column='ssample_end_date', value=ssample_end_dates)
data.to_csv('../Data/FRED/perf_out_{df}_model_midas_sampsize_{ssize}_subsize_{T1}_wsize_{wsize}_fhorizon_{tau}_y'\
'lag_{ylag}.csv'.format(df=x_df_name, ssize=ssize, T1=T1, wsize=wsize, tau=tau, ylag=ylag), index = False, header = True)
end = time.time()
print( 'Completed in: %s sec'%(end - start) )
return data
if __name__ == '__main__':
# multi.freeze_support()
# multi.set_start_method("spawn")
start_time = time.time() # start the timer
# Import GDP and ADS data (from 9/1/1959 to 12/1/2020) and plot
fields_GDP = ['date', 'GDP']
US_GDP_df = pd.read_csv('../Data\\FRED\\quarterly_transformed_pca.csv', engine = 'python', encoding='utf-8', skipinitialspace=True, usecols = fields_GDP, sep = ',', \
parse_dates = ['date'], index_col = 'date')
US_GDP_df.dropna(inplace=True)
US_GDP_df = US_GDP_df.copy().resample('Q').last() # convert beginning-of-the-month dates to end-of-the-month dates
fields_ADS = ['date', 'ADS_Index']
US_ADS_df = pd.read_csv('../Data\\ADS_Index_Most_Current_Vintage.csv', engine = 'python', encoding='utf-8', skipinitialspace=True, usecols = fields_ADS, sep = ',')
US_ADS_df.date = pd.to_datetime( US_ADS_df.date.astype(str).apply(date_parser) )# parse dates
US_ADS_df.dropna(inplace=True)
US_ADS_df.set_index('date', inplace=True)
US_ADS_df_monthly = US_ADS_df.copy().resample('M').mean() # convert a time series to the monthly frequency by sampling then taking mean
US_ADS_df_monthly = US_ADS_df_monthly.copy().shift(3)
US_ADS_df_monthly.dropna(inplace=True)
US_GDP_df.plot(subplots=True, figsize=(10, 8), grid=True, rot=10, sharex=False, sharey=False)
plt.savefig('../Data\\US_gdp_big.pdf', dpi=400)
US_ADS_df_monthly.plot(subplots=True, figsize=(10, 8), grid=True, rot=10, sharex=False, sharey=False)
plt.savefig('../Data\\US_ADS.pdf', dpi=400)
##### Set parameters
wsize = 60 # set a rolling-window size
xlag = '3M' # set number of lags for the high-frequency variable to match with data of the low-frequency variable
ylag = 1 # set a maximum autoregressive lag for the dependent variable
T1 = 100 # set a sub-sample size
processes = 60 # set a number of processes to be launched
tau_step = 1
taus = np.arange(1, 6+tau_step, tau_step) # create a list of forecast horizons
for tau in taus:
OoS_R_sq_midas_wrapper(US_GDP_df, US_ADS_df_monthly, nameof(US_ADS_df), wsize, T1, xlag, ylag, tau, \
poly = 'expalmon', method='rolling', processes=processes)
print( 'Completed in: %s sec'%(time.time() - start_time) )