1 Introduction

This notebook generates the results of the second study in the paper “Forking paths in empirical studies”, available on SSRN.
The dataset for this part comes from Dacheng Xiu’s website (https://dachxiu.chicagobooth.edu/) and was slightly altered. The file is rather large and can be downloaded here.

WARNING: the code for this part takes much longer to run, roughly 11 hours on a 2018 MacBook Pro.
The code is also folded for readability. Click on the corresponding buttons to access it.

2 Setup

First, we load the packages, import the data, and take a brief look.

options(future.globals.maxSize = 891289600*10)                        # Allow large globals for parallel workers (8500 MiB)
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L)   # Allows viewing up to 200 columns
## NULL
library(tidyverse)                                                    # Data wrangling package
library(readxl)                                                       # Read excel files
library(data.table)                                                   # Fast wrangling!
library(purrr)                                                        # Functional programming
library(fst)                                                          # Fast reading & writing of dataframes
library(dtplyr)                                                       # Combining dplyr & data.table
library(lubridate)                                                    # Date management
library(ggallin)                                                      # Addin to ggplot for negative log scales
library(viridis)                                                      # Cool color palette
library(patchwork)                                                    # To layout graphs
library(latex2exp)                                                    # LaTeX in graphs
library(tictoc)                                                       # CPU timing
library(furrr)                                                        # Parallel computing
library(lmtest)                                                       # HAC sd
library(sandwich)                                                     # HAC sd

data_paths <- read_fst("data/data_paths.fst")                         # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx")                              # Loading additional info
data_paths[,1:9] |> head(10) |> knitr::kable()                       # A snapshot
permno  Date        mvel1      beta  betasq  chmom  dolvol    idiovol  indmom
10000   1986-02-28  16100.000  NA    NA      NA     NA        NA       0.2111586
10000   1986-03-31  11960.000  NA    NA      NA     NA        NA       0.2624714
10000   1986-04-30  16330.000  NA    NA      NA     7.897668  NA       0.3269873
10000   1986-05-30  15172.000  NA    NA      NA     8.472954  NA       0.3479495
10000   1986-06-30  11793.859  NA    NA      NA     8.250098  NA       0.3572053
10000   1986-07-31  11734.594  NA    NA      NA     8.113567  NA       0.4227055
10000   1986-08-29  10786.344  NA    NA      NA     8.103863  NA       0.3973749
10000   1986-09-30  4148.594   NA    NA      NA     8.103882  NA       0.3032582
10000   1986-10-31  3911.531   NA    NA      NA     8.112181  NA       0.3860786
10000   1986-11-28  3002.344   NA    NA      NA     8.205756  NA       0.2561681
norm_unif <-  function(v){                   # Maps a vector to uniform values on (-0.5, 0.5] via its empirical cdf
    v <- as.matrix(v)
    return(ecdf(v)(v) - 0.5)
}
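A quick sanity check with made-up values: the normalized ranks 1, 1/3 and 2/3 become values centered around zero.

norm_unif(c(10, 2, 7))
## [1]  0.5000000 -0.1666667  0.1666667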

The original dataset shows strong variations in the number of well-defined fields before 1983, so we discard all observations prior to that date.
We also remove a few predictors that are not suited to the sorting study (e.g., binary-valued ones).

data_paths <- data_paths |> filter(Date > "1983-01-01") # Stability filter
data_paths <- data_paths |> arrange(Date, permno)       # Order
dates <- data_paths$Date |> unique() |> sort()          # Vector of dates
features <- specs |> pull(Acronym)                        # Predictors / firm characteristics 
features <- setdiff(features, c("age", "convind", "divi", "divo", "ms", "nincr", "ps", "rd", "rd_sale", "realestate", "securedind", "sin", "dy"))
data_paths <- data_paths |>
    select(all_of(c("permno", "Date", "Return", features))) |> 
    ungroup()

3 The modules

Empirical work in the paper is modelled as chains of modules: each module encodes one analytical choice (sorting variable, imputation, horizon, sample start and end, portfolio weighting), takes the data as input, and passes its output to the next module.

module_variable <- function(data, variable){
    #save(variable, file = paste0(Sys.time(), "_variable.RData"))
    if(!(variable %in% c("mvel1", "retvol"))){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol", variable))) |>
                   purrr::set_names(c("permno", "Date", "Return", "mvel1", "retvol", "variable")))
    }
    if(variable == "mvel1"){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
                   mutate(variable = mvel1))
    }
    if(variable == "retvol"){
        return(data |> 
                   dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
                   mutate(variable = retvol))
    }    
}
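The module simply standardises column names so that downstream modules can always refer to "variable". A quick illustration (with beta as an arbitrary choice):

data_paths |> module_variable("beta") |> colnames()
## [1] "permno"   "Date"     "Return"   "mvel1"    "retvol"   "variable"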

impute <- function(v){                       # Replaces missing values by the median of the non-missing ones
    v[is.na(v)] <- median(v, na.rm = T)
    return(v)
}
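A one-line check of the imputation rule (made-up values): the missing entry is replaced by the median of the observed ones.

impute(c(1, NA, 3))
## [1] 1 2 3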

module_imputation <- function(data, imputation){
    if(imputation == 0){return(data)}
    if(imputation == 1){
        return(
            data |> 
                group_by(Date) |>
                mutate(variable = impute(variable)) |>
                ungroup()
        )
    }
}

module_horizon <- function(data, horizon){
    vol_thresh <- 0.001 # hard-coded!
    if(horizon == 1){return(data |> na.omit() |> filter(retvol > vol_thresh))}
    if(horizon == 2){
        return(data |>
                   group_by(permno) |>
                   mutate(Return = (Return + dplyr::lead(Return)) / horizon) |> # Returns are scaled to the monthly scale
                   ungroup() |>
                   na.omit() |> 
                   filter(retvol > vol_thresh)
        )
    }
    if(horizon == 3){
        return(data |>
                   group_by(permno) |>
                   mutate(Return = (Return + dplyr::lead(Return) + dplyr::lead(Return, 2)) / horizon) |>
                   ungroup() |>
                   na.omit() |> 
                   filter(retvol > vol_thresh) 
        )
    }
}
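In formula form, for horizon $h \in \{1,2,3\}$, the module replaces the month-$t$ return of stock $i$ with the average of its next $h$ monthly returns, $r_{i,t}^{(h)} = \frac{1}{h}\sum_{k=0}^{h-1} r_{i,t+k}$, so that returns stay on a monthly scale whatever the horizon.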

module_start <- function(data, start_quantile){
    L <- nrow(data)
    data[ceiling(L*start_quantile):L, ]
}

module_stop <- function(data, stop_quantile){
    # Note: in module_aggregator, this runs after module_start, so the
    # quantile applies to the already-truncated sample.
    L <- nrow(data)
    data[1:ceiling(L*stop_quantile), ]
}
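A toy illustration (hypothetical data frame): the two modules cut on row numbers and, since data_paths is sorted by Date, the quantiles roughly correspond to calendar time.

toy <- data.frame(x = 1:9, y = letters[1:9])
toy |> module_start(2/3) |> pull(x)   # keeps rows ceiling(9*2/3) = 6 through 9
## [1] 6 7 8 9
toy |> module_stop(1/3) |> pull(x)    # keeps rows 1 through ceiling(9*1/3) = 3
## [1] 1 2 3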

module_thresh_type <- function(data, thresh_type){
    type <- substr(thresh_type,(nchar(thresh_type)-1),nchar(thresh_type))     # Last 2 letters 
    thresh <- substr(thresh_type, 1, (nchar(thresh_type)-3)) |> as.numeric()
    if(type == "EW"){
        z <- data |> 
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "VW"){
        z <- data |> 
            filter(is.finite(mvel1), mvel1 > 0) |>  # CHECK !!!!
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type*z$mvel1                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "IW"){
        z <- data |> 
            filter(is.finite(retvol), retvol > 0) |>  # CHECK !!!!
            na.omit() |>
            group_by(Date) |>
            mutate(
                tm = quantile(variable, thresh, na.rm = T),
                tp = quantile(variable, 1-thresh, na.rm = T)
            )
        
        z$type <- (z$variable <= z$tm)
        z$type <- z$type - (z$variable >= z$tp)
        z <- z[z$type != 0, ]
        z$weight <- z$type/z$retvol                    # CHECK !!!!!!
        
        z <- z |> 
            pivot_wider(names_from = "type", values_from = "weight") |>
            group_by(Date) |>
            mutate(long = if_else(is.na(`1`), 0, `1`),
                   long = long / sum(long),
                   short = if_else(is.na(`-1`), 0, `-1`),
                   short = short / sum(abs(short)),
                   position = long + short) |>
            ungroup() |>
            mutate(scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale)
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))
    }
    if(type == "CW"){
        z <- data |> 
            na.omit() |>
            group_by(Date) |>
            mutate(variable = norm_unif(variable),
                   sign = sign(variable + rnorm(1)/10^6)) |>
            group_by(Date, sign) |>
            mutate(position = variable / sum(abs(variable)),
                   scale = length(unique(Date))/sum(position^2),
                   position2 = position * scale) 
        
        fit <- lm(Return ~ position2 - 1, data = z)        
        return(data.frame(m = coef(fit),
                          sd = summary(fit)$coefficients[, 2],
                          sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
                          vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
                          n = nrow(z),
                          n_dates = length(unique(z$Date)),
                          aic = AIC(fit)))        
    }
}
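A note on the position2 rescaling used in every branch: writing $p_{i,t}$ for the long-short weights, $T$ for the number of dates and $c = T / \sum_{i,t} p_{i,t}^2$, the slope of the no-intercept regression of returns on $x_{i,t} = c\, p_{i,t}$ is
$$\hat{m} = \frac{\sum_{i,t} x_{i,t}\, r_{i,t}}{\sum_{i,t} x_{i,t}^2} = \frac{c \sum_{i,t} p_{i,t}\, r_{i,t}}{c^2 \sum_{i,t} p_{i,t}^2} = \frac{1}{T} \sum_t \sum_i p_{i,t}\, r_{i,t},$$
i.e., exactly the time-series average of the long-short portfolio return, with the regression machinery supplying the standard errors (IID and HAC) around it.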

A test below. The final module returns 7 outputs: the mean return, the standard deviation of the estimated average return (IID and HAC versions), the vol (standard deviation of the portfolio returns), the sample size, the number of dates, and the AIC of the regression.

# A first test.
data_paths |>
    module_variable("absacc") |>
    module_imputation(0) |>
    module_horizon(1) |>
    module_start(1/3) |>
    module_stop(2/3) |>
    module_thresh_type("0.10_EW")
##                      m           sd      sd_hac        vol      n n_dates
## position2 0.0004014327 0.0009473093 0.001917562 0.05446198 191078     187
##                 aic
## position2 -87706.41

The aggregation (and a test).

module_aggregator <- function(variable,      # No defaults: every argument is supplied explicitly (e.g., by future_pmap_dfr)
                              imputation,
                              horizon,
                              start_quantile,
                              stop_quantile,
                              thresh_type){
    data_paths |>
        module_variable(variable = variable) |>
        module_imputation(imputation = imputation) |>
        module_horizon(horizon = horizon) |>
        module_start(start_quantile = start_quantile) |>
        module_stop(stop_quantile = stop_quantile) |>
        module_thresh_type(thresh_type = thresh_type)
}

# Test! 
module_aggregator(variable = "absacc",
                  imputation = 0,
                  horizon = 1,
                  start_quantile = 0,
                  stop_quantile = 1/3,
                  thresh_type = "0.10_EW")
##                     m         sd      sd_hac        vol      n n_dates
## position2 0.006979506 0.00090491 0.001133974 0.02614822 143350     168
##                 aic
## position2 -102327.5

Surprisingly, we discovered that slicing the parallelization (looping over anomalies and parallelizing within each one) improves its speed, hence the slightly heavy loop below.
Running a single future_pmap_dfr() call over all 44K parameter combinations was too slow (and penalizing upon errors, since one failure would lose the whole run).

4 The main run

Below, we specify the options of each layer and span all the paths. For each anomaly, one output file is created with the "_results_fork.RData" suffix.
The results for the CW weighting scheme (characteristics-weighted) are run separately with the following parameters:

thresh <- 0.5
type <- "CW"

imputation <- c(0,1)
horizon <- 1:3
start_quantile <- c(0, 1/3, 2/3)
stop_quantile <- c(1/3, 2/3, 1)
thresh <- c(0.1, 0.15, 0.2, 0.25, 0.3)
type <- c("EW", "VW", "IW", "CW")
for(i in 1:length(features)){ # PB 63
    if(i%%5==0){print(i)}
    variable <- features[i]
    pars <- expand_grid(variable, imputation, horizon, start_quantile, stop_quantile, thresh, type) |>
        filter(stop_quantile > start_quantile,
               !(type == "CW" & thresh == 0.1),                    # Remove unnecessary combinations:
               !(type == "CW" & thresh == 0.15),                   # thresh plays no role for CW,
               !(type == "CW" & thresh == 0.2),                    # so keep it only once (at 0.3)
               !(type == "CW" & thresh == 0.25))
    pars2 <- pars |> mutate(thresh_type = paste0(thresh, "_", type)) |>
        select(-thresh, -type)
    
    plan(multisession, workers = 5)                                # Set the number of cores
    if(i%%10==0){tictoc::tic()}                                    # Launch timer
    output <- future_pmap_dfr(pars2,  module_aggregator)           # Parallel routine
    if(i%%10==0){tictoc::toc()}                                    # Stop timer
    results <- cbind(pars, output)                                 # Bind parameters to results
    colnames(results)[8:ncol(results)] <- c("avg_return", "sd", "sd_hac", "vol", "n", "n_dates", "aic")
    save(results, file = paste0(variable, "_results_fork.RData"))    
}

Then, all the output files must be transferred to a separate folder within the main folder. Below, we call it "output_ols".

results_full <- c()
filenames <- list.files("output_ols", pattern="*.RData", full.names = TRUE)
for(j in 1:length(filenames)){
    load(filenames[j])
    results_full <- bind_rows(results_full, results)
}
results_full <- results_full |> data.frame() |> remove_rownames()
#save(results_full, file = "results_full.RData")

5 Plots

The two numbers below are the average hacking intervals for robustness checks (paths that deviate from the baseline specification in at most one dimension) versus all paths.
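Throughout, the t-statistic of a path is $t = \sqrt{T}\, \bar{r} / \hat{\sigma}_r$, where $\bar{r}$ is the average portfolio return (avg_return), $\hat{\sigma}_r$ its volatility (vol) and $T$ the number of dates; the hacking interval of an anomaly is then the spread $\max t - \min t$ over the set of paths under consideration.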

library(tidyverse)
load("results_full.RData")
res_baseline <- results_full |> 
    mutate(t = avg_return/vol*sqrt(n_dates)) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t),
           m = median(t)) |>
    filter(imputation == 1,
           horizon == 1,
           start_quantile == 0,
           stop_quantile == 1,
           thresh == 0.2,
           type == "EW")

res_robust <- results_full |>
    mutate(t = avg_return/vol*sqrt(n_dates)) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t),
           m = median(t)) |>
    filter( (horizon == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
                (imputation == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
                (imputation == 1 & horizon == 1 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
                (imputation == 1 & horizon == 1 & start_quantile == 0 & thresh == 0.2 & type == "EW") |
                (imputation == 1 & horizon == 1 & start_quantile == 0 & stop_quantile == 1 & type == "EW") |
                (imputation == 1 & horizon == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2) 
    ) |>
    group_by(variable) |>
    mutate(min = min(t),
           max = max(t),
           hackint = max - min) |>
    ungroup()

res_robust |> summarise(mean(hackint))
## # A tibble: 1 × 1
##   `mean(hackint)`
##             <dbl>
## 1            4.08
results_full |>
    mutate(t = avg_return/vol*sqrt(n_dates)) |>
    group_by(variable) |>
    mutate(min = min(t),
           max = max(t),
           hackint = max - min) |>
    ungroup() |> 
    summarise(mean(hackint))
## # A tibble: 1 × 1
##   `mean(hackint)`
##             <dbl>
## 1            11.0

Then, we show the boxplots of t-statistics for each firm characteristic.

results_full |>
    mutate(t = avg_return/vol*sqrt(n_dates),
           t = avg_return/sd_hac) |>          # The second definition (HAC t-stat) overrides the first
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t),
           m = median(t)) |>
    ggplot(aes(y = reorder(variable, m), x = t)) + 
    geom_boxplot() + theme_bw() + 
    geom_vline(xintercept = 0, linetype = 2, color = "red") + 
    geom_vline(xintercept = -2.5, linetype = 3, color = "orange") + 
    geom_vline(xintercept = 2.5, linetype = 3, color = "orange") + 
    theme(axis.title = element_blank()) +
    geom_point(data = res_baseline, color = "red") +
    geom_point(data = res_robust, aes(x = min), color = "blue", shape = 18, size = 2) + 
    geom_point(data = res_robust, aes(x = max), color = "blue", shape = 18, size = 2) 

#ggsave("t_stats_anom.pdf", width = 8.4, height = 10)

6 Impact of weighting scheme

Same type of representation, but split by weighting scheme.

results_full |>
    mutate(t = avg_return/vol*sqrt(n_dates)) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t),
           m = median(t)) |>
    group_by(variable, type) |>
    summarise(t = mean(t)) |>
    mutate(diff = max(t) - min(t)) |>
    ggplot(aes(x = t, y = reorder(variable, diff), color = type)) + 
    geom_line(color = "black") + theme_bw() +
    geom_point() + 
    theme(axis.title.y = element_blank(),
          legend.position = c(0.75, 0.2)) +
    geom_vline(xintercept = 2.5, linetype = 2) +
    scale_color_manual(name = "weighting", values = c("#F74646", "#FFA22B", "#5E51E0", "#000000"),
                       labels = c("Characteristics-based", "Uniform", "Inverse volatility", "Value (market cap)")) +
    xlab("average t-statistic")

#ggsave("t_weighting.pdf", width = 8.4, height = 10)

Let’s see how that translates in terms of the empirical cdf.

results_full |>
    mutate(t = avg_return/vol*sqrt(n_dates)) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t)  ) |>
    ungroup() |>
    group_by(type) |>
    mutate(v = ecdf(t)(t)) |>
    ggplot(aes(x = t, y = v, color = type)) + #geom_histogram(position = "dodge") + 
    geom_line() + theme_bw() +
    scale_color_manual(values = c("#FE8C1A", "#4F99D0", "#CC2222", "#000000"), name = "weighting") +
    theme(legend.position = c(0.25,0.7),
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank()) +
    xlab("t-statistic") +
    geom_vline(xintercept = 7.6, linetype = 2) +
    annotate("text", x = 9, y = 0.7, label = TeX("max t-stat \n for VW \n portfolios"))

ggsave("t_cdf.pdf", width = 8, height = 4)

7 p-value distribution & vicious hacking

The next step is to extract the histogram values on the [0, 1/2] interval. Because we work with bins of width 0.1, this makes only 5 values.

n_i <- function(v, bins){         # Extracts the histogram counts over the given bins
    temp <- hist(v, breaks = bins, plot = F)
    temp$counts
}

n <- 10
bins <- (0:n)/n
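An illustrative check of n_i on hypothetical draws: with 1,000 uniform p-values, each of the 10 bins should contain roughly 100 observations.

n_i(runif(1000), bins)   # returns 10 counts summing to 1,000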

table_counts <- results_full |>
    mutate(t = sqrt(n_dates)*avg_return/vol) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t)  ) |>
    ungroup() |>
    mutate(t = 1-pnorm(t)) |>
    group_by(variable) |>
    summarise(counts = n_i(t, bins)) |>
    mutate(quantile = row_number()) |>
    pivot_wider(names_from = variable, values_from = counts) |>
    data.matrix()

crit_1 <- 4   # Number of adjacent-bin ratios used (covers p-values up to 0.5)

table_counts <- table_counts + 1 # to avoid issues when counts are zero


crit <- (lead(table_counts) / table_counts ) /                        # Ratio of next-bin to current-bin counts...
    matrix(rep(1:nrow(table_counts), ncol(table_counts)),
           nrow = nrow(table_counts),
           ncol = ncol(table_counts),
           byrow = F)                                                 # ...divided by the bin index
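In formula form, writing $n_i$ for the (incremented) count in the $i$-th p-value bin of a given variable, the criterion computed next is
$$c = \frac{1}{4} \sum_{i=1}^{4} \frac{n_{i+1}}{i \times n_i},$$
which compares adjacent bin counts over the left half of the unit interval.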

criteria <- apply(crit[1:crit_1, -1], 2, mean, na.rm = T)  |>
    t() |>
    as_tibble(rownames = NA) |>
    pivot_longer(cols = everything(), names_to = "variable", values_to = "crit") |>        
    mutate(criterion = if_else(crit < 0.25 , "unnecessary",
                               if_else(crit > 0.4, "problematic", "possible")),
           criterion = recode_factor(criterion, 
                                     `unnecessary` = "unnecessary",
                                     `possible` = "possible",
                                     `problematic` = "problematic",
                                     .ordered = T
           )) 

criteria |> group_by(criterion) |> count()
## # A tibble: 3 × 2
## # Groups:   criterion [3]
##   criterion       n
##   <ord>       <int>
## 1 unnecessary    38
## 2 possible       28
## 3 problematic    15

Let’s finally have a look at the distribution of p-values, one sorting variable at a time.

results_full |>
    mutate(t = sqrt(n_dates)*avg_return/vol) |>
    group_by(variable) |>
    mutate(m = median(t),
           t = if_else(m>0, t, -t)  ) |>
    ungroup() |>
    mutate(t = 1-pnorm(t)) |>
    left_join(criteria, by = "variable") |>
    ggplot(aes(x = t, fill = criterion)) + geom_vline(xintercept = 0.5, linetype = 2) +  
    geom_histogram(breaks = bins) + 
    facet_wrap(vars(variable), ncol = 8, scales = "free_y") +
    theme_bw() +
    scale_x_continuous(breaks = c(0, 0.5, 1)) +
    scale_fill_manual(values = c("#115782", "#FF9D36", "#E7002E"), name = "vicious p-hacking is:") + 
    theme(axis.title = element_blank(),
          legend.position = c(0.5, 0.03),
          legend.title = element_text(face = "bold"),
          strip.text = element_text(size = 6),
          strip.background = element_rect(fill = "#E1EBF0")) +
    guides(fill = guide_legend(nrow = 1, byrow = TRUE))

ggsave("pval_dist.pdf", width = 8, height = 10)

8 Comparing periods

We compare the average returns of the paths across subsamples: Periods 1, 2 and 3 correspond to the (start_quantile, stop_quantile) pairs (0, 1/3), (1/3, 2/3) and (2/3, 1), respectively.

library(ggrepel)
library(patchwork) 
res_per1 <- results_full |>
    mutate(period = case_when(
        start_quantile == 0 & stop_quantile == 1/3 ~ "Period 1",
        start_quantile == 1/3 & stop_quantile == 2/3 ~ "Period 2",
        start_quantile == 2/3 & stop_quantile == 1 ~ "Period 3",
        .default = "Other"
    )) |>
    group_by(variable, period) |>
    summarise(avg_return = mean(avg_return)) |>
    ungroup() #|> filter(avg_return > -0.1) 

c1 <- (res_per1 |> 
           filter(period %in% c("Period 1", "Period 2")) |> 
           pivot_wider(names_from = period, values_from = avg_return) |>
           select(-1) |>
           data.matrix() |>
           cor(use = "complete.obs"))[1,2]

g1 <- res_per1 |>
    pivot_wider(names_from = "period", values_from = "avg_return") %>% {
        ggplot(., aes(x = `Period 1`, y = `Period 2`)) + geom_smooth(method = "lm", se = F, color = "#B0D1FA") + 
            geom_hline(yintercept = 0, color = "#CDCDCD", linetype = 2) + 
            geom_vline(xintercept = 0, color = "#CDCDCD", linetype = 2) + geom_point() + 
            theme_bw() + theme(axis.title = element_text(face = "bold"),
                               panel.grid = element_blank()) + 
            geom_point(data = filter(., abs(`Period 1`) > 0.01), color = "red") + 
            geom_text_repel(data = filter(., abs(`Period 1`) > 0.01), aes(label = variable)) +
            annotate("text", label = paste("Correlation =", round(c1,3)), x = 0.01, y = -0.005, color =  "#568FCB", fontface = 2)
    }
res_per2 <- res_per1   # The aggregation is identical to the one above, so we simply reuse it

c2 <- (res_per2 |> 
           filter(period %in% c("Period 2", "Period 3")) |> 
           pivot_wider(names_from = period, values_from = avg_return) |>
           select(-1) |>
           data.matrix() |>
           cor(use = "complete.obs"))[1,2]

g2 <- res_per2 |>
    pivot_wider(names_from = "period", values_from = "avg_return") %>% {
        ggplot(., aes(x = `Period 2`, y = `Period 3`)) + geom_smooth(method = "lm", se = F, color = "#B0D1FA") + 
            geom_hline(yintercept = 0, color = "#CDCDCD", linetype = 2) + 
            geom_vline(xintercept = 0, color = "#CDCDCD", linetype = 2) + geom_point() + 
            theme_bw() + theme(axis.title = element_text(face = "bold"),
                               panel.grid = element_blank()) + 
            geom_point(data = filter(., `Period 2` > 0.004 | `Period 2` < (-0.005)), color = "red") + 
            geom_text_repel(data = filter(., `Period 2` > 0.004 | `Period 2` < (-0.005)), aes(label = variable)) +
            annotate("text", label = paste("Correlation =", round(c2,3)), x = 0.003, y = -0.005, color =  "#568FCB", fontface = 2)
    }
g1 / g2

ggsave("oos.pdf", width = 8, height = 6)

A focus on the acc characteristic. The first block below computes an inverse volatility-weighted long-short portfolio sorted on acc; the second replays the equal-weight construction on absacc.

thresh <- 0.2
data_paths |> 
    select(permno, Date, Return, acc,  mvel1, retvol) |>
    mutate(variable = acc) |>
    na.omit() |>
    group_by(Date) |> 
    mutate(type = case_when(
        variable <= quantile(variable, thresh, na.rm = T) ~ "Short",
        variable >= quantile(variable, 1 - thresh, na.rm = T) ~ "Long")
    ) |>                       
    # mutate(type = if_else(variable < quantile(variable, thresh, na.rm = T), "Short",
    #                       if_else(variable > quantile(variable, 1 - thresh, na.rm = T), "Long", "Neutral"))) |>
    pivot_wider(names_from = "type", values_from = "Return") |>
    mutate(is_Long = is.finite(Long),
           is_Short = is.finite(Short)) |>
    group_by(Date) |>
    summarize(ret = sum(Long/retvol, na.rm = T)/sum(is_Long/retvol, na.rm = T) - 
                  sum(Short/retvol, na.rm = T)/sum(is_Short/retvol, na.rm = T)) |>
    summarise(m = mean(ret, na.rm = T),
              s = sd(ret, na.rm = T),
              n = length(ret)) 
## # A tibble: 1 × 3
##          m      s     n
##      <dbl>  <dbl> <int>
## 1 -0.00254 0.0227   467
thresh <- 0.2
z <- data_paths |> 
    select(permno, Date, Return, absacc,  mvel1, retvol) |>
    filter(is.finite(retvol), retvol > 0) |>  # CHECK !!!!
    mutate(variable = absacc) |>
    na.omit() |>
    group_by(Date) |>
    mutate(
        tm = quantile(variable, thresh, na.rm = T),
        tp = quantile(variable, 1-thresh, na.rm = T)
    ) |>
    ungroup()

z$type <- (z$variable >= z$tp)               # Long leg: high values (opposite sign convention to module_thresh_type)
z$type <- z$type - (z$variable <= z$tm)
z <- z[z$type != 0, ]
z$weight <- z$type                           # Equal weights; divide by z$retvol for the IW version   # CHECK !!!!!!

z <- z |> 
    pivot_wider(names_from = "type", values_from = "weight") |>
    group_by(Date) |>
    mutate(long = if_else(is.na(`1`), 0, `1`),
           long = long / sum(long),
           short = if_else(is.na(`-1`), 0, `-1`),
           short = short / sum(abs(short)),
           position = long + short) |>
    ungroup() |>
    mutate(scale = length(unique(Date))/sum(position^2),
           position2 = position * scale)


fit <- lm(Return ~ position2 - 1, data = z)


9 Comparison with prior work

In this section, we use the data from the Open Source Asset Pricing paper.
This allows us to compare our results to those of the original studies.

9.1 Studies with common predictors

The colors encode the ease-to-replicate indicator defined in the paper. It measures whether the values obtained in the published articles can easily be attained by some of the paths we generate.
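Concretely, in the code below, a Gaussian is fitted to each predictor's distribution of average returns across paths, the published return is mapped to its quantile $q = \Phi\left((r^{pub} - \bar{r}) / \hat{\sigma}\right)$, and the indicator is $EtR = 1 - \max\left(0, (q - 0.9)/0.1\right)$: it equals 1 when the published value lies below the 90% quantile of the paths and shrinks to 0 as the published value moves into the extreme right tail.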

published_anomalies <- read_csv("https://raw.githubusercontent.com/OpenSourceAP/CrossSection/master/SignalDoc.csv")

specs2 <- specs[,c(2,4,5)]
colnames(specs2) <- c("variable", "author", "journal")
specs2 <- specs2 |> 
    separate(journal, sep = ", ", into = c("journal", "year")) |>
    separate(author, sep = " ", into = c("first_auth", "other_auth")) |>
    separate(journal, sep = ",", into = c("year", "journal")) |>
    mutate(first_auth = first_auth |> str_remove(",")) |>
    mutate(id = paste(first_auth, year)) |>
    left_join(
        published_anomalies |> 
            rename(t_stat = `T-Stat`) |>
            select(Authors, Year, Return, t_stat) |>
            separate(Authors, sep = " ", into = c("first_auth", "other_auth")) |>
            mutate(first_auth = first_auth |> str_remove(",")) |>
            mutate(id = paste(first_auth, Year)),
        by = "id") |>
    filter(is.finite(Return), is.finite(t_stat)) |>
    mutate(Return = Return / 100) |>
    select(variable, Return, t_stat, Year, id)

grouped_results <- results_full |>
    group_by(variable) |>
    filter(is.finite(aic), is.finite(avg_return)) |>
    mutate(D = (aic - min(aic)),
           weight = if_else(is.na(D), 0, exp(-D/2) /sum(exp(-D/2), na.rm = T)),
           b = sum(weight * avg_return, na.rm = T)) |>
    summarise(s = sum(weight * sqrt(sd_hac^2+(b-avg_return)^2), na.rm = T),
              b = abs(mean(b)),
              n_paths = n()) |> # CAREFUL !!!
    left_join(specs2, by = "variable") |>
    na.omit() 
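For reference, the aggregation above is Akaike model averaging: with $\Delta_k = AIC_k - \min_j AIC_j$, path $k$ receives the weight $w_k = e^{-\Delta_k/2} / \sum_j e^{-\Delta_j/2}$; the pooled return is $b = \left|\sum_k w_k \hat{\mu}_k\right|$, and $s = \sum_k w_k \sqrt{\widehat{se}_k^2 + (\bar{\mu} - \hat{\mu}_k)^2}$ is the corresponding unconditional standard error (in the spirit of Burnham & Anderson), where $\hat{\mu}_k$ and $\widehat{se}_k$ denote the average return and HAC standard error of path $k$, and $\bar{\mu} = \sum_k w_k \hat{\mu}_k$.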

quant <- 0.9
res_plot <- results_full |>
    left_join(grouped_results, by = "variable") |>
    filter(is.finite(Return)) |>
    group_by(variable) |>
    mutate(m = mean(avg_return),
           sd = sd(avg_return),
           avg_return = if_else(m > 0, avg_return, -avg_return),
           m = mean(avg_return),
           q = pnorm(Return, mean = m, sd = sd),
           EtR = 1-if_else(q>quant, (q-quant)/(1-quant), 0),
           min_q = min(q)) |>
    filter(q == min_q) 
res_plot |>
    ggplot(aes(x = avg_return, y = reorder(variable, b))) + geom_vline(xintercept = 0, linetype = 2) + 
    geom_point(color = "#AAAAAA", size = 0.7) + theme_bw() +
    #geom_errorbar(aes(xmin = m - 2.58*sd/sqrt(n), xmax = m + 2.58*sd/sqrt(n)), color = "#2266DD") +
    geom_errorbar(aes(xmin = b - 2.58*s/sqrt(n_paths), xmax = b + 2.58*s/sqrt(n_paths)), size = 0.3) +
    geom_point(aes( x = Return, y = variable), color = "red", shape = 17) +
    xlab("Coefficient (average return)") +
    geom_label(data = res_plot |> group_by(variable) |> summarise(q = mean(EtR)), size = 3.5,
               label.padding = unit(0.12, "lines"),
               aes(y = variable, x = 0.028, label = paste0(round(100*q, 1), "%"), fill = q), alpha = 0.4) + 
    theme(axis.title.y = element_blank(),
          legend.position = "none",
          axis.title.x = element_text(face = "bold")) +
    scale_fill_distiller(palette = "RdYlGn", direction = 1)

ggsave("comparison.pdf", width = 8, height = 5)

9.2 Distribution for all Sharpe ratios

results_full |>
    ggplot(aes(x = avg_return/vol)) + geom_histogram() + 
    facet_wrap(vars(variable), scales = "free") + theme_bw()

9.3 Focus on the acc predictor

We plot the distribution of the average returns across paths, along with Gaussian and Student fits.
Lastly, we locate the value of the original study, far in the right tail of the histogram.

results_acc <- results_full |> 
    filter(variable == "acc") |>
    mutate(D = (aic - min(aic)),
           weight = if_else(is.na(D), 0, exp(-D/2) /sum(exp(-D/2), na.rm = T)),
           b = sum(weight * avg_return, na.rm = T),
           s = sum(weight * sqrt(sd_hac^2+(b-avg_return)^2), na.rm = T))
b <- results_acc$b[1]
s <- results_acc$s[1]
stu <- function(x){
    dt((x-mean(results_acc$avg_return))/sd(results_acc$avg_return), df = 3)/sd(results_acc$avg_return)
}
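The stu() function is thus the density of a location-scale Student $t$ with 3 degrees of freedom, $f(x) = \frac{1}{\hat{\sigma}} f_{t_3}\left((x - \hat{\mu})/\hat{\sigma}\right)$, where $\hat{\mu}$ and $\hat{\sigma}$ are the sample mean and standard deviation of the path average returns.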

#pt((0.008666667-mean(results_acc$avg_return))/sd(results_acc$avg_return), df = 3) # Student quantile
#qnorm(0.9, mean(results_acc$avg_return), sd(results_acc$avg_return))
#pnorm(0.008666667, mean(results_acc$avg_return), sd(results_acc$avg_return))

results_acc  |>    
    ggplot(aes(x = avg_return)) + geom_vline(xintercept = 0, color = "#CCCCCC", linetype = 2) + 
    geom_histogram(aes(y = ..density..), alpha = 0.4) + theme_bw() +
    xlab("Average return") + xlim(-0.0027, 0.009) +
    theme(axis.title.x = element_text(face = "bold"),
          axis.title.y = element_blank(),
          panel.grid = element_blank()) +
    geom_function(fun = dnorm, 
                  args = list(mean = mean(results_acc$avg_return),
                              sd = sd(results_acc$avg_return)),
                  color = "#2288DD"
    ) +
    geom_vline(xintercept = qnorm(0.9, mean(results_acc$avg_return), sd(results_acc$avg_return)), color = "#999999") + 
    geom_function(fun = stu) +
    geom_segment(x = 0.008666667, xend = 0.008666667, y = 60, yend = 5, 
                 arrow = arrow(length = unit(0.2, "cm")), color = "red") +
    geom_label(aes(x = 0.0058, y = 230), label = "Gaussian \n 90% \n threshold", size = 3.5) +
    #geom_segment(x = b-2.58*s/sqrt(576), xend = b+2.58*s/sqrt(576), y = 250, yend = 250) + 
    annotate("text", x = 0.0086, y = 80, label = "original \n study", color = "red", fontface = 2) +
    annotate("text", x = 0.001, y = 162, label = "fitted \n Gaussian", color = "#2288DD", fontface = 2) +
    annotate("text", x = -0.001, y = 60, label = "fitted \n Student", color = "#000000", fontface = 2)

# ggsave("acc.pdf", width = 8, height = 4)