1 Introduction

This notebook generates the results of the second study in the paper Forking paths in empirical studies available on SSRN.
The dataset for this part comes from Dacheng Xiu’s website (https://dachxiu.chicagobooth.edu/) and was slightly altered. It is a bit heavy and can be downloaded here.

WARNING: he code for this part takes much longer, roughly 11 hours on a 2018 MacBook Pro.
The code is also folded to ease readability. Click on the corresponding buttons to access it.

2 Setup

First, we load packages & import the data.
And take a brief look.

options(future.globals.maxSize= 891289600*10)
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L)   # Allows to view up to 200 columns
## NULL
library(tidyverse)                                                    # Data wrangling package
library(readxl)                                                       # Read excel files
library(data.table)                                                   # Fast wrangling!
library(purrr)                                                        # Functional programming
library(fst)                                                          # Fast reading & writing of dataframes
library(dtplyr)                                                       # Combining dplyr & data.table
library(lubridate)                                                    # Date management
library(ggallin)                                                      # Addin to ggplot for negative log scales
library(viridis)                                                      # Cool color palette
library(patchwork)                                                    # To layout graphs
library(latex2exp)                                                    # LaTeX in graphs
library(tictoc)                                                       # CPU timing
library(furrr)                                                        # Parallel computing
library(lmtest)                                                       # HAC sd
library(sandwich)                                                     # HAC sd

data_paths <- read_fst("data/data_paths.fst")                         # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx")                              # Loading additional info
data_paths[,1:9] |> head(10) |> knitr::kable()                       # A snapshot
permno Date mvel1 beta betasq chmom dolvol idiovol indmom
10000 1986-02-28 16100.000 NA NA NA NA NA 0.2111586
10000 1986-03-31 11960.000 NA NA NA NA NA 0.2624714
10000 1986-04-30 16330.000 NA NA NA 7.897668 NA 0.3269873
10000 1986-05-30 15172.000 NA NA NA 8.472954 NA 0.3479495
10000 1986-06-30 11793.859 NA NA NA 8.250098 NA 0.3572053
10000 1986-07-31 11734.594 NA NA NA 8.113567 NA 0.4227055
10000 1986-08-29 10786.344 NA NA NA 8.103863 NA 0.3973749
10000 1986-09-30 4148.594 NA NA NA 8.103882 NA 0.3032582
10000 1986-10-31 3911.531 NA NA NA 8.112181 NA 0.3860786
10000 1986-11-28 3002.344 NA NA NA 8.205756 NA 0.2561681
norm_unif <-  function(v){                   # This is a function that uniformalises a vector
    v <- v %>% as.matrix()
    return(ecdf(v)(v) -0.5)
}

The original dataset has strong variations in the number of well-defined fields before 1983.
Thus, we truncate the sample before this date.
Before that, we also remove a few predictors not suited for the sorting study (e.g., binary-valued).

data_paths <- data_paths |> filter(Date > "1983-01-01") # Stability filter
data_paths <- data_paths |> arrange(Date, permno)       # Order
dates <- data_paths$Date |> unique() |> sort()          # Vector of dates
features <- specs |> pull(Acronym)                        # Predictors / firm characteristics 
features <- setdiff(features, c("age", "convind", "divi", "divo", "ms", "nincr", "ps", "rd", "rd_sale", "realestate", "securedind", "sin", "dy"))
data_paths <- data_paths |>
    select(all_of(c("permno", "Date", "Return", features))) |> 
    ungroup()

3 The modules

Empirical work in the paper is modelled as chains of modules.

module_variable <- function(data, variable){
    #save(variable, file = paste0(Sys.time(), "_variable.RData"))
    if(!(variable %in% c("mvel1", "retvol"))){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol", variable))) |>
                   purrr::set_names(c("permno", "Date", "Return", "mvel1", "retvol", "variable")))
    }
    if(variable == "mvel1"){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
                   mutate(variable = mvel1))
    }
    if(variable == "retvol"){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
                   mutate(variable = retvol))
    }    
}

impute <- function(v){
    v[is.na(v)] <- median(v, na.rm = T)
    return(v)
}

module_imputation <- function(data, imputation){
    if(imputation == 0){return(data)}
    if(imputation == 1){
        return(
            data |> 
                group_by(Date) |>
                mutate(variable = impute(variable)) |>
                ungroup()
        )
    }
}

module_horizon <- function(data, horizon){
    vol_thresh <- 0.001 # hard-coded!
    if(horizon == 1){return(data |> na.omit() |> filter(retvol > vol_thresh))}
    if(horizon == 2){
        return(data |>
                   group_by(permno) |>
                   mutate(Return = (Return + dplyr::lead(Return)) / horizon) |> # Returns are scaled to the monthly scale
                   ungroup() |>
                   na.omit() |> 
                   filter(retvol > vol_thresh)
        )
    }
    if(horizon == 3){
        return(data |>
                   group_by(permno) |>
                   mutate(Return = (Return + dplyr::lead(Return) + dplyr::lead(Return, 2)) / horizon) |>
                   ungroup() |>
                   na.omit() |> 
                   filter(retvol > vol_thresh) 
        )
    }
}

module_start <- function(data, start_quantile){
    L <- nrow(data)
    data[ceiling(L*start_quantile):L, ]
}

module_stop <- function(data, stop_quantile){
    L <- nrow(data)
    data[1:ceiling(L*stop_quantile), ]
}

module_thresh_type <- function(data, thresh_type){
    type <- substr(thresh_type,(nchar(thresh_type)-1),nchar(thresh_type))     # Last 2 letters 
    thresh <- substr(thresh_type, 1, (nchar(thresh_type)-3)) |> as.numeric()
    if(type == "EW"){
        z <- data |> 
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "VW"){
        z <- data |> 
            filter(is.finite(mvel1), mvel1 > 0) |>  # CHECK !!!!
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type*z$mvel1                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "IW"){
        z <- data |> 
            filter(is.finite(retvol), retvol > 0) |>  # CHECK !!!!
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type/z$retvol                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "CW"){
        z <- data |> 
            na.omit() |>
            group_by(Date) |>
            mutate(variable = norm_unif(variable),
                   sign = sign(variable + rnorm(1)/10^6)) |>
            group_by(Date, sign) |>
            mutate(position = variable / sum(abs(variable)),
                   scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale) 
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))        
    }
}

A test below. There are 7 outputs: the mean return, the standard deviation of the estimated average return (IID and HAC), the vol (standard deviation of returns), the sample size, the number of dates and the AIC of the regression.

# A first test.
data_paths |>
    module_variable("absacc") |>
    module_imputation(0) |>
    module_horizon(1) |>
    module_start(1/3) |>
    module_stop(2/3) |>
    module_