This notebook presents the R code used for the paper “Characteristics-driven
returns in equilibrium”.
Only the strict minimum of material is provided.
The dataset, with anonymized firm identifiers, is
available here.
The R Markdown notebook that can be run directly in RStudio is
accessible here.
Some results towards the end were generated with additional
notebooks, hence they require external output.
The non-baseline notebooks (neural networks, macro-data, sparse models)
are available upo request.
First, we load packages & import the data.
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L) # Allows to view up to 200 columns
## NULL
library(tidyverse) # Data wrangling package
library(readxl) # Read excel files
library(data.table) # Fast wrangling!
library(fst) # Fast reading & writing of dataframes
library(dtplyr) # Combining dplyr & data.table
library(lubridate) # Date management
library(ggallin) # Addin to ggplot for negative log scales
library(viridis) # Cool color palette
library(patchwork) # To layout graphs
library(latex2exp) # LaTeX in graphs
library(RcppEigen) # For fast linear models
library(reshape2) # List wrangling
library(Matrix) # High speed matrix manipulation
library(fastDummies) # For fixed effects
data_demand <- read_fst("data/data_demand.fst") # Loading main dataset
#data_demand <- read_fst("data/demand_macro.fst") # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx") # Loading additional info
na_0 <- function(x){ # Function that deals with missing points
x[is.na(x)] <- 0 # Values are in [-0.5,0.5], NAs => 0
return(x)
}
data_demand <- data_demand %>% # The above function is applied to the data
mutate(across(3:ncol(data_demand), na_0))
We create useful variables, especially predictor names used in models
later on.
The baseline estimates are based on the characteristics that are
released monthly (there are 20 of them).
The original dataset has strong variations in the number of well-defined
fields before 1983.
Thus, we truncate the sample before this date.
data_demand <- data_demand %>% filter(Date > "1984-06-01") # Stability filter
data_demand <- data_demand %>% arrange(Date, permno) # Order
dates <- data_demand$Date %>% unique() %>% sort() # Vector of dates
features <- specs %>% # Predictors / firm characteristics
# filter(Frequency == "Monthly") %>%
pull(Acronym)
#features <- c("mvel1", "bm", "mom12m") # Characteristics used in panel models
D_features <- paste0("D_", features) # Change in predictors
varz <- c(features, D_features) # List of all independent variables
#varz <- colnames(data_demand)[4:ncol(data_demand)]
Below, we create the objects in which we will store some
results/outputs.
Also, we fix the sample size used throughout the notebook!
5 years means 60 monthly points. If there are roughly 5,000 firms in one
sample, this makes 300k observations for one estimation.
We recall that there are 20 characteristics + 20 changes thereof, which
makes 40 predictors.
sample <- 24 # Sample size (in months)
frequency <- 12 # How often we estimate => every X months
dates_oos <- dates[sample:length(dates)] # Out-of-sample dates
res_pool <- list() # Initialize results
res_fixed <- list() # Initialize results
R2_pool <- 0
R2_fixed <- 0
cor_series <- array(0, dim = c((length(dates_oos)-1), # Initialize corr matrices, nb dates
length(varz) + 1,
length(varz) + 1))
cov_series <- array(0, dim = c((length(dates_oos)-1), # Initialize cov matrices, nb dates
length(varz) + 1,
length(varz) + 1))
f_e <- c() # Initialize fixed effects
avg_ret_fac <- matrix(0, nrow = length(dates_oos)-1, # Initialize average returns
ncol = 3) # We focus on 3 factors
avg_fe_fac <- matrix(0, nrow = length(dates_oos)-1, # Initialize average fixed effects
ncol = 3 )
We rapidly source the custom functions we use below.
source("func.R")
The loop below perform 4 tasks:
1. perform the pooled estimations and store the corresponding estimates
& metrics (e.g., \(R^2\)).
2. compute the correlations and covariances between the characteristics
and their changes.
3. estimate the model with fixed effects & store the values.
4. compute the average return and average fixed effect of sorted
portfolios on the rolling samples.
The loop is on the dates.
We could make do without a loop via the pmap()
function.
But the complexity of the output is a bit deterring (this would generate
a complex list).
se_fe <- F
j <- 1
dates_used <- c()
avg_ret <- c()
features_sort <- c("mvel1", "bm", "mom12m")
tictoc::tic()
for(i in 1:(length(dates_oos)-2)){
if(i %% 60 == 1){print(dates_oos[i])} # Keeps track of progress
data_train <- data_prep(data_demand, dates[i], dates_oos[i]) # Sample at time t
data_test <- data_prep(data_demand, dates_oos[i+1], dates_oos[i+1]) # Test set: t+1
data_test_0 <- data_train %>% filter(Date == dates_oos[i]) # Test set: t
avg_ret <- bind_rows(avg_ret, # This stores the average returns in the sample
data_train |> group_by(permno) |>
summarise(avg_ret = mean(Return, na.rm = T)) |>
mutate(date = dates_oos[i]))
y <- data_train$Return
x <- data_train %>% select(all_of(varz)) %>% data.matrix()
if(i %% frequency == 1){
if(se_fe == T){
d <- cbind(x, dummy_cols(data_train$permno)[,-1]) %>% data.matrix() %>% Matrix(sparse = T) # For fixed effects
d <- t(d) %*% d
nn <- ncol(x)
NN <- ncol(d)
d_inv <- solve(d)
d_iag <- diag(d_inv)[(nn+1):NN]
}
if(se_fe == F){ d_iag <- NA}
dates_used <- c(dates_used, as.character(dates_oos[i]))
# The code below is for the pooled OLS estimations
permnos <- data_train$permno %>% unique() %>% sort() # Keep model names
pool_temp <- fit_lin(x = x, y = y, data_train$permno, permnos, as.character(dates_oos[i]), d_iag, "pooled")
res_pool[[j]] <- pool_temp %>% mutate(sample = sample) # Storing estimation results
}
test_set <- intersect(data_test$permno, data_test_0$permno) %>% # Common stocks for test
intersect(permnos)
pred_temp <- pred_lin(res = pool_temp, # OOS prediction
x_new = data_test_0 %>%
filter(permno %in% test_set) %>%
select(all_of(varz)),
permno = test_set,
type = "pooled")
target <- data_test %>% filter(permno %in% test_set) %>% pull(Return) # OOS realization
error <- pred_temp - target
R2_pool[i] <- 1 - var(error) / var(target)
## Next we compute the correlations & covariances
## cor_series[i,,] <- cor(data_train %>% select(-permno, -Date), use = "pairwise.complete.obs") # Store the correlation matrix
## cov_series[i,,] <- cov(data_train %>% select(-permno, -Date), use = "pairwise.complete.obs") # Store the covariance matrix
if(i %% frequency == 1){
# The code below pertains to the panel model
fixed_temp <- fit_lin(x = x, y = y, data_train$permno, permnos, as.character(dates_oos[i]), d_iag, "fixed") # Within estimation
res_fixed[[j]] <- fixed_temp %>% mutate(sample = sample)
# Finally, the average returns & average fixed-effects of the sorted portfolios
for(k in 1:length(features_sort)){
avg_ret_fac[i,k] <- sorted_metric(data_train, features_sort[k], "Return") # Store the factor average return
avg_fe_fac[i,k] <- sorted_metric(data = fixed_temp |> filter(field == "fe") |> # Store the factor fixed effects
# bind_cols(f_e = fixef(fit_fe) %>% as.numeric(),
# permno = unique(data_train$permno) %>% sort()) %>%
right_join(data_train |> select(permno, Date, mvel1, mom12m, bm),
by = "permno"),
var = features_sort[k],
metric = "value")
}
j <- j + 1
}
pred_temp <- pred_lin(res = fixed_temp, # OOS prediction
x_new = data_test_0 %>%
filter(permno %in% test_set) %>%
select(all_of(varz)),
permnos = test_set,
type = "fixed")
error <- pred_temp - target # OOS error
R2_fixed[i] <- 1 - var(error) / var(target) # OOS R2
}
## [1] "1986-05-30"
## [1] "1991-05-31"
## [1] "1996-05-31"
## [1] "2001-05-31"
## [1] "2006-05-31"
## [1] "2011-05-31"
## [1] "2016-05-31"
## [1] "2021-05-28"
tictoc::toc()
## 758.912 sec elapsed
save(avg_ret, file = paste0("avg_ret_", sample, ".RData"))
save(avg_ret_fac, file = paste0("avg_ret_fac_", sample, ".RData"))
save(avg_fe_fac, file = paste0("avg_fe_fac_", sample, ".RData"))
Below, we wrangle the output, for estimation outputs (estimates, t-stats, p-values) and \(R^2\) separately.
res_pool <- bind_rows(res_pool)
res_fixed <- bind_rows(res_fixed)
Saving important results below for future use.
model <- "linear"
spec <- "simple"
res_full <-
bind_rows(
res_pool %>% mutate(type = "pooled",
model = model,
spec = spec,
date = as.Date(date)),
res_fixed %>% mutate(type = "fixed",
model = model,
spec = spec,
date = as.Date(date)),
data.frame(value = R2_pool,
factor = NA,
field = "R2_oos",
date = dates_oos[1:(length(dates_oos)-2)],
permno = NA,
sample = sample,
type = "pooled",
model = model,
spec = spec),
data.frame(value = R2_fixed,
factor = NA,
field = "R2_oos",
date = dates_oos[1:(length(dates_oos)-2)],
permno = NA,
sample = sample,
type = "fixed",
model = model,
spec = spec)
)
save(res_full, file = paste0("res_full_linear_",sample,".RData"))
g1 <- res_full %>%
filter(field == "t_stat",
factor %in% c("mom12m", "mvel1", "bm"),
type == "pooled") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = c(0.75, 1.2), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) + labs(subtitle = "Pooled estimation t-statistic") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g2 <- res_full %>%
filter(field == "t_stat",
factor %in% c("mom12m", "mvel1", "bm"),
type == "fixed") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = c(0.75, 1.2), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) +
labs(subtitle = "Within estimation t-statistic (with fixed effects)") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g1/g2