Packages & parameters

We start by loading the required packages & functions.

rm(list = ls())
library(tidyverse)
library(Rcpp)
library(mvtnorm)
library(reshape2)
library(latex2exp)
library(cowplot)
sourceCpp("c_functions.cpp")
source("R_functions.R")

Then, we set some default parameter values. Note: because T is reserved in R, we use TT instead.

k   <- 12
TT  <- 36          # TT = number of observations
a_x <- 0           # Constant in the AR process
a_y <- 0           # Constant in the AR process
r_x <- 0.9         # rho_x
r_y <- 0.7         # rho_y
s_x <- 0.2         # sigma_x
s_y <- 0.4         # sigma_y
cor <- 0           # Correlation between innovations e_y and e_x
n_sim <- 10^5      # Number of simulations 
n_points <- 10^3   # Number of discretization points in the approximation of the integral
STOP <- 10^3       # Point where the discretization stops

Testing the functions

All functions were coded both in R and C. The C versions were only faster for the autoregressive simulations. However, the matrix multiplications in the large dimensional simulations made the C (or C++ in Armadillo) code slower. Below, we test the following:

In the first function, processes are allowed to have a drift, but this drift does not impact the MSE, so we set it to zero.

We use tictoc to compute running times.

tictoc::tic()
quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n_sim)
[1] 0.4236085 0.3981790
tictoc::toc()
2.775 sec elapsed
tictoc::tic()
MSE_simulation(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n_sim)
[1] 0.4211911 0.3793136
tictoc::toc()
15.314 sec elapsed
tictoc::tic()
MSE_integral_ind(r_x, r_y, s_y, TT, k, n_points, STOP)
[1] 0.4217557
tictoc::toc()
0.197 sec elapsed
tictoc::tic()
MSE_integral(r_x, r_y, s_y, cor, TT, k, n_points, STOP)
[1] 0.4217697
tictoc::toc()
5.622 sec elapsed

Even with 100,000 simulations, the values are already close.

First study

The first step is to evaluate the accuracy of the methods and quantify how long they take.
The C functions were not designed to take vectors parameter values, thus we resort to ugly loops, though functional programming would be more elegant (see the sensitivity analysis below). One important reason why a loop is convenient is that it’s easy to reset the (pseudo-)random number generator.
This is not costly computationally because the number of iterations is small.
Nevertheless, the loop below is quite time-consuming (a few hours on a 2019 MacBook Pro).

n <- 1000*2^(0:12)
stop <- 100 * round(log(n))
val_AR_sim <- 0
val_S_sim <- 0
val_integral_ind <- 0
var_AR_sim <- 0
var_S_sim <- 0
cpu_AR_sim <- 0
cpu_S_sim <- 0
cpu_integral_ind <- 0
for(i in 1:length(n)){
    set.seed(42)
    tictoc::tic()
    tmp <- quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n[i])
    val_AR_sim[i] <- tmp[1]
    var_AR_sim[i] <- tmp[2]
    z <- tictoc::toc(quiet = T)
    cpu_AR_sim[i] <- z$toc - z$tic
    
    set.seed(42)
    tictoc::tic()
    tmp <- MSE_simulation(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n[i])
    val_S_sim[i] <- tmp[1]
    var_S_sim[i] <- tmp[2]
    z <- tictoc::toc(quiet = T)
    cpu_S_sim[i] <- z$toc - z$tic
    
    set.seed(42)
    tictoc::tic()
    val_integral_ind[i] <- MSE_integral_ind(r_x, r_y, s_y, TT, k, n[i], STOP = stop[i])
    z <- tictoc::toc(quiet = T)
    cpu_integral_ind[i] <- z$toc - z$tic
}

val <- data.frame(n, val_AR_sim, val_S_sim, val_integral_ind) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "MSE",
           type = substr(type, 5, 9))
save(val, file="values.RData")

cpu <- data.frame(n, cpu_AR_sim, cpu_S_sim, cpu_integral_ind) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "CPU")
save(cpu, file = "cpu.RData")

CI <- data.frame(n, var_AR_sim, var_S_sim) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "CI",
           type = substr(type, 5, 9),
           value = 1.96 * sqrt(value/n)) # Width of confidence interval
save(var, file = "CI.RData")

Then, we wrangle the results and plot them.
We start with the raw MSE values.

data_g <- bind_rows(val, CI) %>%
    pivot_wider(names_from = "indicator", values_from = "value") %>%
    mutate(lower = MSE - CI, upper = MSE + CI) %>%
    pivot_longer(3:6, names_to = "indicator", values_to = "value")
plot_MSE_n <- data_g %>%
    ggplot(aes(x = n/1000, y = value, color = type)) + 
    geom_line(data = data_g %>% filter(indicator == "MSE")) + 
    geom_line(data = data_g %>% filter(indicator == "lower"), linetype = 2) +
    geom_line(data = data_g %>% filter(indicator == "upper"), linetype = 3) +
    xlab(TeX("\\textbf{Points (in thousands)}")) + ylab(TeX("\\textbf{Mean Squared Error}")) +
    #scale_color_viridis(option="inferno", begin = 0.4, end = 0.9, discrete = TRUE)
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral", "Multivariate"), 
                       values = c("#0B56B0", "#FD6909", "#12A74C")) +
    scale_x_log10() +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.2))
plot_MSE_n

We see that Monte-Carlo simulations, as expected, yield values that oscillate a lot. The speed of convergence is proportional to \(n^{-1/2}\), so in our case, the error has an order of magnitude of \(10^{-3}\), which is what we observe.

Next, we turn to computation times.

plot_CPU_n <- cpu %>%
    ggplot(aes(x = n/1000, y = value, color = type)) + geom_line() +
    xlab(TeX("\\textbf{Points (in thousands)}")) + ylab(TeX("\\textbf{CPU time (seconds) - logscale}")) +
    scale_y_log10() +
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral", "Multivariate"), 
                       values = c("#0B56B0", "#FD6909", "#12A74C")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.2))
plot_CPU_n

There is almost a 10-fold different between each color!

Below, in order to save time, we discard the slowest method.

Save both plots together

plot_grid(plot_MSE_n, plot_CPU_n, ncol = 2, nrow = 1)
Removed 13 row(s) containing missing values (geom_path).Removed 13 row(s) containing missing values (geom_path).

#ggsave(paste(folder, "comparison.pdf", sep='/'))

Impact of T

Next, we switch to other values of T. It’s interesting to see what happens when the sample size augments.
Again, we resort to a loop.

TTT <- seq(5,100,5)
value_AR_sim <- 0
value_integral_ind <- 0
cpu_AR_sim <- 0
cpu_integral_ind <- 0
for(i in 1:length(TTT)){
    tictoc::tic()
    value_AR_sim[i] <- quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TTT[i], k, n_sim)[1]
    z <- tictoc::toc(quiet = T)
    cpu_AR_sim[i] <- z$toc - z$tic

    tictoc::tic()
    value_integral_ind[i] <- MSE_integral_ind(r_x, r_y, s_y, TTT[i], k, n_points, 100 * round(log(n_points)) )
    z <- tictoc::toc(quiet = T)
    cpu_integral_ind[i] <- z$toc - z$tic
}

values_TT <- data.frame(TTT, value_AR_sim, value_integral_ind) %>%
    pivot_longer(-TTT, names_to = "type", values_to = "value")
save(values_TT, file = "values_TT.RData")

cpu_TT <- data.frame(TTT, cpu_AR_sim, cpu_integral_ind) %>%
    pivot_longer(-TTT, names_to = "type", values_to = "value")
save(cpu_TT, file = "cpu_TT.RData")

Then we analyze the results.

plot_MSE_T <- values_TT %>%
    ggplot(aes(x = TTT, y = value, color = type)) + geom_line() + 
    xlab(TeX("\\textbf{T}")) + ylab(TeX("\\textbf{Mean Squared Error}")) +
    #scale_color_viridis(option="inferno", begin = 0.4, end = 0.9, discrete = TRUE)
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral"), 
                       values = c("#0B56B0", "#FD6909")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.6))
plot_MSE_T

When T increases, the MSE follows. The range of \(y\) values makes it hard to discern the difference between the two curves.

plot_CPU_T <- cpu_TT %>%
    ggplot(aes(x = TTT, y = value, color = type)) + geom_line() +
    xlab(TeX("\\textbf{T}")) + ylab(TeX("\\textbf{CPU time (seconds) - logscale}")) +
    scale_y_log10() +
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral"), 
                       values = c("#0B56B0", "#FD6909")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.6))
plot_CPU_T

Overall, the discretized integral is more than 100 times faster that the MC simulations.

ggsave("comparison_09.pdf")
Saving 7 x 7 in image
---
title: "Predictive regressions: comparing methods"
output: html_notebook
---

# Packages & parameters

We start by loading the required packages & functions.

```{r, message = F, warning = F, size = "footnotesize"}
rm(list = ls())
library(tidyverse)
library(Rcpp)
library(mvtnorm)
library(reshape2)
library(latex2exp)
library(cowplot)
sourceCpp("c_functions.cpp")
source("R_functions.R")
```

Then, we set some **default parameter** values. **Note**: because T is reserved in R, we use TT instead.

```{r, size = "footnotesize"}
k   <- 12
TT  <- 36          # TT = number of observations
a_x <- 0           # Constant in the AR process
a_y <- 0           # Constant in the AR process
r_x <- 0.9         # rho_x
r_y <- 0.7         # rho_y
s_x <- 0.2         # sigma_x
s_y <- 0.4         # sigma_y
cor <- 0           # Correlation between innovations e_y and e_x
n_sim <- 10^5      # Number of simulations 
n_points <- 10^3   # Number of discretization points in the approximation of the integral
STOP <- 10^3       # Point where the discretization stops
```


# Testing the functions

All functions were coded both in **R** and **C**. The C versions were only faster for the autoregressive simulations.
However, the matrix multiplications in the large dimensional simulations made the C (or C++ in Armadillo) code slower.
Below, we test the following:

- **Process**: relying on the function that simulates correlated bivariate AR(1) processes (in C): **quad_err_C_2d**    
- **Multivariate**: relying on the function that simulates vector z using the large covariance matrix (in R): **MSE_simulation**    
- **Integral**: relying on the function that uses numerical integration to compute the MSE: **MSE_integral**   

In the first function, processes are allowed to have a drift, but this drift does not impact the MSE, so we set it to zero. 

We use tictoc to compute running times.

```{r, size = "footnotesize"}
tictoc::tic()
quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n_sim)
tictoc::toc()

tictoc::tic()
MSE_simulation(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n_sim)
tictoc::toc()

tictoc::tic()
MSE_integral_ind(r_x, r_y, s_y, TT, k, n_points, STOP)
tictoc::toc()

tictoc::tic()
MSE_integral(r_x, r_y, s_y, cor, TT, k, n_points, STOP)
tictoc::toc()
```

Even with 100,000 simulations, the values are already close.

# First study

The first step is to evaluate the **accuracy** of the methods and quantify how long they take.   
The C functions were not designed to take vectors parameter values, thus we resort to ugly loops, though functional programming would be more elegant (see the sensitivity analysis below). One important reason why a loop is convenient is that it's easy to reset the (pseudo-)random number generator.   
This is not costly computationally because the number of iterations is small.   
Nevertheless, the loop below is quite **time-consuming** (a few hours on a 2019 MacBook Pro).

```{r, message = F, warning = F, size = "footnotesize"}
n <- 1000*2^(0:12)
stop <- 100 * round(log(n))
val_AR_sim <- 0
val_S_sim <- 0
val_integral_ind <- 0
var_AR_sim <- 0
var_S_sim <- 0
cpu_AR_sim <- 0
cpu_S_sim <- 0
cpu_integral_ind <- 0
for(i in 1:length(n)){
    set.seed(42)
    tictoc::tic()
    tmp <- quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n[i])
    val_AR_sim[i] <- tmp[1]
    var_AR_sim[i] <- tmp[2]
    z <- tictoc::toc(quiet = T)
    cpu_AR_sim[i] <- z$toc - z$tic
    
    set.seed(42)
    tictoc::tic()
    tmp <- MSE_simulation(a_x, a_y, r_x, r_y, s_x, s_y, cor, TT, k, n[i])
    val_S_sim[i] <- tmp[1]
    var_S_sim[i] <- tmp[2]
    z <- tictoc::toc(quiet = T)
    cpu_S_sim[i] <- z$toc - z$tic
    
    set.seed(42)
    tictoc::tic()
    val_integral_ind[i] <- MSE_integral_ind(r_x, r_y, s_y, TT, k, n[i], STOP = stop[i])
    z <- tictoc::toc(quiet = T)
    cpu_integral_ind[i] <- z$toc - z$tic
}

val <- data.frame(n, val_AR_sim, val_S_sim, val_integral_ind) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "MSE",
           type = substr(type, 5, 9))
save(val, file="values.RData")

cpu <- data.frame(n, cpu_AR_sim, cpu_S_sim, cpu_integral_ind) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "CPU")
save(cpu, file = "cpu.RData")

CI <- data.frame(n, var_AR_sim, var_S_sim) %>%
    pivot_longer(-n, names_to = "type", values_to = "value") %>%
    mutate(indicator = "CI",
           type = substr(type, 5, 9),
           value = 1.96 * sqrt(value/n)) # Width of confidence interval
save(var, file = "CI.RData")
```

Then, we wrangle the results and plot them.   
We start with the **raw MSE** values.

```{r, size = "footnotesize"}
data_g <- bind_rows(val, CI) %>%
    pivot_wider(names_from = "indicator", values_from = "value") %>%
    mutate(lower = MSE - CI, upper = MSE + CI) %>%
    pivot_longer(3:6, names_to = "indicator", values_to = "value")
plot_MSE_n <- data_g %>%
    ggplot(aes(x = n/1000, y = value, color = type)) + 
    geom_line(data = data_g %>% filter(indicator == "MSE")) + 
    geom_line(data = data_g %>% filter(indicator == "lower"), linetype = 2) +
    geom_line(data = data_g %>% filter(indicator == "upper"), linetype = 3) +
    xlab(TeX("\\textbf{Points (in thousands)}")) + ylab(TeX("\\textbf{Mean Squared Error}")) +
    #scale_color_viridis(option="inferno", begin = 0.4, end = 0.9, discrete = TRUE)
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral", "Multivariate"), 
                       values = c("#0B56B0", "#FD6909", "#12A74C")) +
    scale_x_log10() +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.2))
plot_MSE_n
```

We see that **Monte-Carlo** simulations, as expected, yield values that oscillate a lot. The speed of convergence is proportional to $n^{-1/2}$, so in our case, the error has an order of magnitude of $10^{-3}$, which is what we observe.

Next, we turn to **computation times**.

```{r, size = "footnotesize"}
plot_CPU_n <- cpu %>%
    ggplot(aes(x = n/1000, y = value, color = type)) + geom_line() +
    xlab(TeX("\\textbf{Points (in thousands)}")) + ylab(TeX("\\textbf{CPU time (seconds) - logscale}")) +
    scale_y_log10() +
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral", "Multivariate"), 
                       values = c("#0B56B0", "#FD6909", "#12A74C")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.2))
plot_CPU_n
```

There is almost a 10-fold different between each color!   

Below, in order to save time, we discard the slowest method.

# Save both plots together

```{r, size = "footnotesize"}
plot_grid(plot_MSE_n, plot_CPU_n, ncol = 2, nrow = 1)
#ggsave(paste(folder, "comparison.pdf", sep='/'))
```

# Impact of T

Next, we switch to other values of T. It's interesting to see what happens when the sample size augments.   
Again, we resort to a loop.

```{r, size = "footnotesize"}
TTT <- seq(5,100,5)
value_AR_sim <- 0
value_integral_ind <- 0
cpu_AR_sim <- 0
cpu_integral_ind <- 0
for(i in 1:length(TTT)){
    tictoc::tic()
    value_AR_sim[i] <- quad_err_C_2d2(a_x, a_y, r_x, r_y, s_x, s_y, cor, TTT[i], k, n_sim)[1]
    z <- tictoc::toc(quiet = T)
    cpu_AR_sim[i] <- z$toc - z$tic

    tictoc::tic()
    value_integral_ind[i] <- MSE_integral_ind(r_x, r_y, s_y, TTT[i], k, n_points, 100 * round(log(n_points)) )
    z <- tictoc::toc(quiet = T)
    cpu_integral_ind[i] <- z$toc - z$tic
}

values_TT <- data.frame(TTT, value_AR_sim, value_integral_ind) %>%
    pivot_longer(-TTT, names_to = "type", values_to = "value")
save(values_TT, file = "values_TT.RData")

cpu_TT <- data.frame(TTT, cpu_AR_sim, cpu_integral_ind) %>%
    pivot_longer(-TTT, names_to = "type", values_to = "value")
save(cpu_TT, file = "cpu_TT.RData")
```

Then we analyze the results.

```{r, size = "footnotesize"}
plot_MSE_T <- values_TT %>%
    ggplot(aes(x = TTT, y = value, color = type)) + geom_line() + 
    xlab(TeX("\\textbf{T}")) + ylab(TeX("\\textbf{Mean Squared Error}")) +
    #scale_color_viridis(option="inferno", begin = 0.4, end = 0.9, discrete = TRUE)
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral"), 
                       values = c("#0B56B0", "#FD6909")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.6))
plot_MSE_T
```

When T increases, the MSE follows. The range of $y$ values makes it hard to discern the difference between the two curves.


```{r, size = "footnotesize"}
plot_CPU_T <- cpu_TT %>%
    ggplot(aes(x = TTT, y = value, color = type)) + geom_line() +
    xlab(TeX("\\textbf{T}")) + ylab(TeX("\\textbf{CPU time (seconds) - logscale}")) +
    scale_y_log10() +
    scale_color_manual(name = "Type", 
                       labels = c("Process", "Integral"), 
                       values = c("#0B56B0", "#FD6909")) +
    theme(aspect.ratio = 1.2, legend.position = c(0.8, 0.6))
plot_CPU_T
```

Overall, the discretized integral is more than 100 times faster that the MC simulations.

```{r, size = "footnotesize"}
plot_grid(plot_MSE_n, plot_CPU_T, ncol = 2, nrow = 1)
#ggsave("comparison_09.pdf")
```
