This notebook generates the results of the second study in the paper *Forking paths in empirical studies*, available on SSRN.
The dataset for this part comes from Dacheng Xiu's website (https://dachxiu.chicagobooth.edu/) and was slightly altered. The file is rather large and can be downloaded here.
WARNING: the code for this part takes much longer to run,
roughly 11 hours on a 2018 MacBook Pro.
The code is also folded to ease readability; click on
the corresponding buttons to access it.
First, we load the packages and import the data, then take a brief look at it.
options(future.globals.maxSize = 891289600*10) # Raise the size limit for globals passed to parallel workers
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L) # Allows viewing up to 200 columns in RStudio's data viewer
library(tidyverse) # Data wrangling package
library(readxl) # Read excel files
library(data.table) # Fast wrangling!
library(purrr) # Functional programming
library(fst) # Fast reading & writing of dataframes
library(dtplyr) # Combining dplyr & data.table
library(lubridate) # Date management
library(ggallin) # ggplot add-on for negative log scales
library(viridis) # Cool color palette
library(patchwork) # To layout graphs
library(latex2exp) # LaTeX in graphs
library(tictoc) # CPU timing
library(furrr) # Parallel computing
library(lmtest) # HAC standard errors
library(sandwich) # HAC standard errors
data_paths <- read_fst("data/data_paths.fst") # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx") # Loading additional info
data_paths[,1:9] |> head(10) |> knitr::kable() # A snapshot
permno | Date | mvel1 | beta | betasq | chmom | dolvol | idiovol | indmom |
---|---|---|---|---|---|---|---|---|
10000 | 1986-02-28 | 16100.000 | NA | NA | NA | NA | NA | 0.2111586 |
10000 | 1986-03-31 | 11960.000 | NA | NA | NA | NA | NA | 0.2624714 |
10000 | 1986-04-30 | 16330.000 | NA | NA | NA | 7.897668 | NA | 0.3269873 |
10000 | 1986-05-30 | 15172.000 | NA | NA | NA | 8.472954 | NA | 0.3479495 |
10000 | 1986-06-30 | 11793.859 | NA | NA | NA | 8.250098 | NA | 0.3572053 |
10000 | 1986-07-31 | 11734.594 | NA | NA | NA | 8.113567 | NA | 0.4227055 |
10000 | 1986-08-29 | 10786.344 | NA | NA | NA | 8.103863 | NA | 0.3973749 |
10000 | 1986-09-30 | 4148.594 | NA | NA | NA | 8.103882 | NA | 0.3032582 |
10000 | 1986-10-31 | 3911.531 | NA | NA | NA | 8.112181 | NA | 0.3860786 |
10000 | 1986-11-28 | 3002.344 | NA | NA | NA | 8.205756 | NA | 0.2561681 |
norm_unif <- function(v){ # Maps a vector to its centred empirical CDF values (approximately uniform on [-0.5, 0.5])
  v <- as.matrix(v)
  return(ecdf(v)(v) - 0.5)
}
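A quick illustration with toy values:
norm_unif(c(10, 2, 7)) # 0.500 -0.167 0.167: each value is replaced by its centred rank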
In the original dataset, the number of well-defined (non-missing) fields varies strongly before 1983, so we truncate the sample at this date.
We also remove a few predictors that are not suited to the sorting study (e.g., binary-valued ones).
data_paths <- data_paths |> filter(Date > "1983-01-01") # Stability filter
data_paths <- data_paths |> arrange(Date, permno) # Order
dates <- data_paths$Date |> unique() |> sort() # Vector of dates
features <- specs |> pull(Acronym) # Predictors / firm characteristics
features <- setdiff(features, # Remove predictors unsuited to the sorting study (e.g., binary-valued)
                    c("age", "convind", "divi", "divo", "ms", "nincr", "ps", "rd",
                      "rd_sale", "realestate", "securedind", "sin", "dy"))
data_paths <- data_paths |>
  select(all_of(c("permno", "Date", "Return", features))) |> # Keep identifiers, returns & retained predictors
  ungroup()
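Two quick sanity checks on the filtered sample (the exact figures depend on the downloaded file):
length(dates)    # Number of monthly dates after the 1983 cut
length(features) # Number of retained firm characteristics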
Empirical work in the paper is modelled as chains of modules: each module below takes the panel as input, applies one design choice (a fork), and passes the result on to the next one.
module_variable <- function(data, variable){ # Fork 1: choice of the sorting characteristic
  # mvel1 & retvol are always kept: later modules need them for weighting and filtering
  if(!(variable %in% c("mvel1", "retvol"))){
    return(data |>
             dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol", variable))) |>
             purrr::set_names(c("permno", "Date", "Return", "mvel1", "retvol", "variable")))
  }
  if(variable == "mvel1"){
    return(data |>
             dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
             mutate(variable = mvel1))
  }
  if(variable == "retvol"){
    return(data |>
             dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
             mutate(variable = retvol))
  }
}
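For instance, picking a characteristic returns a six-column panel in which the chosen predictor is renamed variable (absacc is an arbitrary choice here):
data_paths |> module_variable("absacc") |> head(3)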
impute <- function(v){ # Replaces missing values with the median of the available ones
  v[is.na(v)] <- median(v, na.rm = TRUE)
  return(v)
}
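A one-line sanity check of the imputation rule (toy values):
impute(c(1, NA, 3, NA)) # NAs are replaced by median(c(1, 3)): returns 1 2 3 2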
module_imputation <- function(data, imputation){ # Fork 2: treatment of missing values
  if(imputation == 0){return(data)}              # 0: leave NAs (rows are dropped later by na.omit)
  if(imputation == 1){                           # 1: cross-sectional (per-date) median imputation
    return(
      data |>
        group_by(Date) |>
        mutate(variable = impute(variable)) |>
        ungroup()
    )
  }
}
module_horizon <- function(data, horizon){ # Fork 3: holding period, in months
  vol_thresh <- 0.001 # hard-coded volatility filter!
  if(horizon == 1){return(data |> na.omit() |> filter(retvol > vol_thresh))}
  if(horizon == 2){
    return(data |>
             group_by(permno) |>
             mutate(Return = (Return + dplyr::lead(Return)) / horizon) |> # Returns are scaled to the monthly scale
             ungroup() |>
             na.omit() |>
             filter(retvol > vol_thresh)
    )
  }
  if(horizon == 3){
    return(data |>
             group_by(permno) |>
             mutate(Return = (Return + dplyr::lead(Return) + dplyr::lead(Return, 2)) / horizon) |>
             ungroup() |>
             na.omit() |>
             filter(retvol > vol_thresh)
    )
  }
}
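The three branches above only differ in the number of future returns that are averaged. For reference, a compact equivalent for any horizon (a sketch, not the code used in the study):
module_horizon_any <- function(data, horizon){ # Hypothetical generalisation of module_horizon
  vol_thresh <- 0.001 # Same hard-coded filter as above
  data |>
    group_by(permno) |>
    mutate(Return = Reduce(`+`, lapply(0:(horizon - 1), \(k) dplyr::lead(Return, k))) / horizon) |>
    ungroup() |>
    na.omit() |>
    filter(retvol > vol_thresh)
}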
module_start <- function(data, start_quantile){ # Fork 4: where the sample starts (as a row quantile)
  L <- nrow(data)
  data[ceiling(L * start_quantile):L, ]
}
module_stop <- function(data, stop_quantile){ # Fork 5: where the sample stops (as a row quantile)
  L <- nrow(data)
  data[1:ceiling(L * stop_quantile), ]
}
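Note that module_stop acts on the sample already truncated by module_start, so the two quantiles compound; since the panel is sorted by Date, row quantiles are effectively date quantiles. For example:
middle <- data_paths |> module_start(1/3) |> module_stop(1/2)
range(middle$Date) # Implied sub-period: roughly the middle third of the full sample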
module_thresh_type <- function(data, thresh_type){ # Forks 6 & 7: sorting threshold & weighting scheme
  # thresh_type packs both forks into one string, e.g. "0.2_EW" = 20% quantile threshold, equal weights
  type <- substr(thresh_type, nchar(thresh_type) - 1, nchar(thresh_type)) # Weighting scheme (last 2 letters)
  thresh <- substr(thresh_type, 1, nchar(thresh_type) - 3) |> as.numeric() # Quantile threshold
  if(type == "EW"){ # Equal weighting
    z <- data |>
      na.omit() |>
      group_by(Date) |>
      mutate(
        tm = quantile(variable, thresh, na.rm = TRUE),    # Lower sorting threshold
        tp = quantile(variable, 1 - thresh, na.rm = TRUE) # Upper sorting threshold
      )
    z$type <- (z$variable <= z$tm)          # +1 for the bottom bucket...
    z$type <- z$type - (z$variable >= z$tp) # ...and -1 for the top bucket
    z <- z[z$type != 0, ]                   # Keep only the two extreme buckets
    z$weight <- z$type                      # Equal weights: signed bucket indicators
    z <- z |>
      pivot_wider(names_from = "type", values_from = "weight") |>
      group_by(Date) |>
      mutate(long = if_else(is.na(`1`), 0, `1`),
             long = long / sum(long),
             short = if_else(is.na(`-1`), 0, `-1`),
             short = short / sum(abs(short)),
             position = long + short) |>
      ungroup() |>
      mutate(scale = length(unique(Date)) / sum(position^2), # Leverage normalisation
             position2 = position * scale)
    fit <- lm(Return ~ position2 - 1, data = z)
    return(data.frame(m = coef(fit),
                      sd = summary(fit)$coefficients[, 2],
                      sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1, 2],
                      vol = z |> group_by(Date) |> summarise(r = sum(Return * position)) |> pull(r) |> sd(),
                      n = nrow(z),
                      n_dates = length(unique(z$Date)),
                      aic = AIC(fit)))
  }
  if(type == "VW"){ # Value (size) weighting
    z <- data |>
      filter(is.finite(mvel1), mvel1 > 0) |> # Require a valid, positive market capitalisation
      na.omit() |>
      group_by(Date) |>
      mutate(
        tm = quantile(variable, thresh, na.rm = TRUE),
        tp = quantile(variable, 1 - thresh, na.rm = TRUE)
      )
    z$type <- (z$variable <= z$tm)
    z$type <- z$type - (z$variable >= z$tp)
    z <- z[z$type != 0, ]
    z$weight <- z$type * z$mvel1 # Value weights: size-scaled signed indicators
    z <- z |>
      pivot_wider(names_from = "type", values_from = "weight") |>
      group_by(Date) |>
      mutate(long = if_else(is.na(`1`), 0, `1`),
             long = long / sum(long),
             short = if_else(is.na(`-1`), 0, `-1`),
             short = short / sum(abs(short)),
             position = long + short) |>
      ungroup() |>
      mutate(scale = length(unique(Date)) / sum(position^2),
             position2 = position * scale)
    fit <- lm(Return ~ position2 - 1, data = z)
    return(data.frame(m = coef(fit),
                      sd = summary(fit)$coefficients[, 2],
                      sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1, 2],
                      vol = z |> group_by(Date) |> summarise(r = sum(Return * position)) |> pull(r) |> sd(),
                      n = nrow(z),
                      n_dates = length(unique(z$Date)),
                      aic = AIC(fit)))
  }
  if(type == "IW"){ # Inverse-volatility weighting
    z <- data |>
      filter(is.finite(retvol), retvol > 0) |> # Require a valid, positive volatility
      na.omit() |>
      group_by(Date) |>
      mutate(
        tm = quantile(variable, thresh, na.rm = TRUE),
        tp = quantile(variable, 1 - thresh, na.rm = TRUE)
      )
    z$type <- (z$variable <= z$tm)
    z$type <- z$type - (z$variable >= z$tp)
    z <- z[z$type != 0, ]
    z$weight <- z$type / z$retvol # Inverse-volatility weights
    z <- z |>
      pivot_wider(names_from = "type", values_from = "weight") |>
      group_by(Date) |>
      mutate(long = if_else(is.na(`1`), 0, `1`),
             long = long / sum(long),
             short = if_else(is.na(`-1`), 0, `-1`),
             short = short / sum(abs(short)),
             position = long + short) |>
      ungroup() |>
      mutate(scale = length(unique(Date)) / sum(position^2),
             position2 = position * scale)
    fit <- lm(Return ~ position2 - 1, data = z)
    return(data.frame(m = coef(fit),
                      sd = summary(fit)$coefficients[, 2],
                      sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1, 2],
                      vol = z |> group_by(Date) |> summarise(r = sum(Return * position)) |> pull(r) |> sd(),
                      n = nrow(z),
                      n_dates = length(unique(z$Date)),
                      aic = AIC(fit)))
  }
  if(type == "CW"){ # Characteristic (rank) weighting: no buckets, positions proportional to centred ranks
    z <- data |>
      na.omit() |>
      group_by(Date) |>
      mutate(variable = norm_unif(variable),             # Centred uniform ranks
             sign = sign(variable + rnorm(1) / 10^6)) |> # Tiny jitter breaks ties at the median
      group_by(Date, sign) |>
      mutate(position = variable / sum(abs(variable)),
             scale = length(unique(Date)) / sum(position^2),
             position2 = position * scale)
    fit <- lm(Return ~ position2 - 1, data = z)
    return(data.frame(m = coef(fit),
                      sd = summary(fit)$coefficients[, 2],
                      sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1, 2],
                      vol = z |> group_by(Date) |> summarise(r = sum(Return * position)) |> pull(r) |> sd(),
                      n = nrow(z),
                      n_dates = length(unique(z$Date)),
                      aic = AIC(fit)))
  }
}
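A quick check of the string parsing with a hypothetical input:
tt <- "0.2_EW"                           # 20% threshold, equal weighting
substr(tt, nchar(tt) - 1, nchar(tt))     # "EW" -> weighting scheme
as.numeric(substr(tt, 1, nchar(tt) - 3)) # 0.2  -> quantile threshold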
A first test below. The chain returns seven outputs: the mean return (m), the standard error of this estimate under IID and HAC (Newey-West) assumptions (sd and sd_hac), the volatility of the portfolio returns (vol), the sample size (n), the number of dates (n_dates), and the AIC of the underlying regression (aic).
# A first test.
data_paths |>
  module_variable("absacc") |>
  module_imputation(0) |>
  module_horizon(1) |>
  module_start(1/3) |>
  module_stop(2/3) |>
  module_thresh_type("0.2_EW") # The final fork value is hypothetical: it is truncated in the source
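To span the full garden of forking paths, the modules can then be mapped over a grid of choices. Below is a minimal sketch (the fork values are illustrative, not the exact grid of the paper); furrr, loaded above, provides a drop-in parallel version.
run_path <- function(variable, imputation, horizon, start, stop, thresh_type){
  data_paths |>
    module_variable(variable) |>
    module_imputation(imputation) |>
    module_horizon(horizon) |>
    module_start(start) |>
    module_stop(stop) |>
    module_thresh_type(thresh_type)
}
forks <- expand_grid( # Hypothetical subset of forks, for illustration only
  variable = c("absacc", "chmom"),
  imputation = c(0, 1),
  horizon = 1:3,
  start = c(0, 1/3),
  stop = c(2/3, 1),
  thresh_type = c("0.1_EW", "0.1_VW", "0.1_IW", "0.1_CW")
)
# results <- purrr::pmap_dfr(forks, run_path) # or furrr::future_pmap_dfr() to run in parallel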