The many shades of Fama-MacBeth

1 Introduction

This notebook generates the results of the third study in the paper Forking paths in empirical studies available on SSRN.

The code is folded to ease readability. Click on the corresponding buttons to access it.

2 Data & packages

First we load the few packages we will need.
Then, we download the data locally. We do this to speed up computation times later on so that each call does not need to fetch the data all over again.
While we could build a large tidy dataset with all portfolios returns (across dates, assets and asset types), we rely on a brute force approach that stores samples separately. This in fact save some time, as we will not have to filter and pivot the data to a wide format: these steps, when repeated many fold, are time-consuming.

NOTE: the data from Ken French’s website is subject to updates and potential instability, see Noisy Factors.

The data fetching script (FM_data.qmd) is available upon request.

Show the code
library(tidyverse)
library(quantmod)
library(DescTools)
# source(knitr::purl("FM_data.qmd", quiet=TRUE))

load("FF_factors_monthly.RData")

winso_mat <- function(m, q){
    q_min <- quantile(m[,], q, na.rm = T)
    q_max <- quantile(m[,], 1-q, na.rm = T)
    m[m < q_min] <- q_min
    m[m > q_max] <- q_max
    m
}

3 The functions

Show the code
fetch_asset <- function(freq, asset_file, weight, winso_1){
    load(paste0(asset_file, "_", freq, ".RData"))
    load(paste0("FF_factors_", freq, ".RData"))
    assets <- assets |> filter(weighting == weight) 
    assets <- bind_cols(assets |> select(date, weighting),                 # Winsorising stage
                        assets |> 
                            select(-date, -weighting) |> 
                            data.matrix() |> winso_mat(winso_1) |>
                            data.frame())
    left_join(assets, FF_factors |> select(-RF), by = "date") |>   # Joining stage
        filter(is.finite(SMB))    
}

pass_1 <- function(data, factors){                               # This function performs the first pass
    nb_factors <- length(factors)           
    asset_names <- colnames(data |> select(-all_of(c("date", "weighting", factors))))   # Keep names of assets
    models <- lapply(paste(asset_names, ' ~  Mkt_RF + SMB + HML + RMW + CMA'),  # Specification of models
                     function(f){lm(as.formula(f), data = data) %>%             # Calling lm(.)
                             summary() %>%                                      # Gathering the output
                             "$"(coef) %>%                                      # Keeping only the coefs
                             data.frame() %>%                                   # Convert to dataframe
                             select(Estimate)}                                  # Keep the coefficients
    )
    
    betas <- matrix(unlist(models), ncol = nb_factors + 1, byrow = T) %>%       # Extracting the betas
        data.frame(row.names = asset_names)                                     # Formatting: row names
    colnames(betas) <- c("Constant", "Mkt_RF", "SMB", "HML", "RMW", "CMA")      # Formatting: col names
    betas   
}

pass_2 <- function(FM_data, reg_type){
    factors <- c("Mkt_RF", "SMB", "HML", "RMW", "CMA")                        # Fixed for the whole study
    nb_factors <- length(factors)
    dates <- FM_data |> select(-all_of(factors)) |> colnames()
    if(reg_type != "static"){
        dates <- max(dates)
    }
    models <- lapply(paste("`", dates, "`", ' ~  Mkt_RF + SMB + HML + RMW + CMA', sep = ""),
                     function(f){ 
                         temp <- lm(as.formula(f), data = FM_data, y = T)   # Calling lm(.)
                         temp |> summary() %>%                              # Gathering the output
                             "$"(coef) %>%  data.frame() %>%                # Convert coefs to dataframe
                             select(Estimate) |>
                             bind_rows(tibble(Estimate = AIC(temp)),        # Add AIC & sd below
                                       tibble(Estimate = summary(temp)$sigma),
                                       tibble(Estimate = nrow(FM_data)),
                                       tibble(Estimate = sum(temp$residuals^2)),
                                       tibble(Estimate = var(temp$y)))
                     }                                  
    )
    gammas <- matrix(unlist(models), ncol = nb_factors + 6, byrow = T) %>%  # Switch to dataframe
        data.frame(row.names = dates)                                       # & set row names
    colnames(gammas) <- c("Constant", "Mkt_RF", "SMB", "HML", "RMW", "CMA", 
                          "aic", "sd", "sample_size", "sum_sq_err", "var_y") # Set col names
    gammas
}

4 The modules

Show the code
module_data <- function(asset_file, weight, winso_1){
    assets_monthly <- fetch_asset("monthly", asset_file, weight, winso_1)
    assets_daily <- fetch_asset("daily", asset_file, weight, winso_1)
    list(assets_monthly = assets_monthly,
         assets_daily = assets_daily)
}

module_FM <- function(dates_all, reg_type, winso_2, data){
    data_monthly <- data$assets_monthly
    data_daily <- data$assets_daily
    factors <- c("Mkt_RF", "SMB", "HML", "RMW", "CMA")              # Fixed for the whole study
    if(reg_type != "static"){                                       # Sample size for rolling regressions
        sample_size <- substr(reg_type, nchar(reg_type)-1, nchar(reg_type)) |> as.numeric()
        data_monthly <- data_monthly |> filter(date <= dates_all) |> tail(sample_size)
        data_daily <- data_daily |> filter(date <= dates_all) |> tail(sample_size*5)
    }
    data_monthly <- data_monthly[, colMeans(is.na(data_monthly)) == 0]     # Keep only assets with all observations
    data_daily <- data_daily[, colMeans(is.na(data_daily)) == 0]           # Keep only assets with all observations
    # Below we remove numeric columns with no variations
    data_monthly <- data_monthly[,c(T,T, apply(data_monthly[3:ncol(data_monthly)], 2, sd) > 0.001)]
    data_daily <- data_daily[,c(T,T, apply(data_daily[3:ncol(data_daily)], 2, sd, ) > 0.001)]
    asset_names_m <- data_monthly |> select(-all_of(c("date", "weighting", factors))) |> colnames()
    asset_names_d <- data_daily |> select(-all_of(c("date", "weighting", factors))) |> colnames()
    asset_names <- intersect(asset_names_d, asset_names_m)                 # Same names for daily & monthly
    data_daily <- data_daily |> select(all_of(c("date", "weighting", factors, asset_names)))
    data_monthly <- data_monthly |> select(all_of(c("date", "weighting", factors, asset_names)))
    betas_monthly <- pass_1(data_monthly, factors) |> data.matrix() |>     # First pass      
        winso_mat(winso_2) |> data.frame()                                 # Winsorize betas
    betas_daily <- pass_1(data_daily, factors) |> data.matrix() |>         # First pass      
        winso_mat(winso_2) |> data.frame()                                 # Winsorize betas
    loadings_m <- betas_monthly |> select(-Constant)                       # Keep only betas 
    loadings_d <- betas_daily |> select(-Constant)                         # Keep only betas 
    returns_m <- data_monthly |>                                           # Start from returns
        select(all_of(asset_names)) |>                                     # Keep the returns only
        data.frame(row.names = data_monthly$date) |>                       # Set row names
        t()
    returns_m <- returns_m[,apply(returns_m, 2, sd, ) > 0.001]             # Remove constant cols (data glitch)
    FM_data_m <- cbind(loadings_m, returns_m)
    FM_data_d <- cbind(loadings_d, returns_m)
    bind_rows(
        pass_2(FM_data_m, reg_type) |>                                   # Second pass
            rownames_to_column("date") |> mutate(freq = "monthly"), 
        pass_2(FM_data_d, reg_type) |>                                   #  Second pass, on monthly returns!
            rownames_to_column("date") |> mutate(freq = "daily")
    ) |>
        mutate(reg_type = reg_type,
               winso_2 = winso_2,
               n_assets = length(asset_names))
}

FM_full <- function(asset_file = "data_yahoo",
                    weight = "EW",
                    winso_1 = 0.01,
                    pars_2 = pars_2
){
    print(asset_file)
    data <- module_data(
        asset_file = asset_file,
        weight = weight,
        winso_1 = winso_1
    ) 
    pmap_dfr(
        list(dates_all = pars_2$dates_all,
             reg_type = pars_2$reg_type,
             winso_2 = pars_2$winso_2), 
        module_FM,
        data = data
    ) |>
        mutate(asset_file = asset_file,
               weight = weight,
               winso_1 = winso_1)
}

module_data(
    asset_file = "Sorted_25",
    weight = "EW",
    winso_1 = 0.02
) %>% # OLD PIPE !
    module_FM(
        dates_all = "1987-10-31" |> as.Date(),
        reg_type = "dynamic_24",
        winso_2 = 0.02,
        data = .)
date Constant Mkt_RF SMB HML RMW CMA aic sd sample_size sum_sq_err var_y freq reg_type winso_2 n_assets
1987-09-30 0.0019265 -0.0159170 0.0132091 0.0078578 -0.0152435 -0.0065291 -157.5279 0.0089843 25 0.0015336 0.000127 monthly dynamic_24 0.02 25
1987-09-30 -0.0039317 0.0740385 -0.0098788 -0.0159272 -0.0407886 -0.0007061 -155.5813 0.0093410 25 0.0016578 0.000127 daily dynamic_24 0.02 25

Next, we span all path configurations.

Show the code
asset_file <- c("Sorted_25", "Sorted_100", "Industry_12", "Industry_48", "data_yahoo")
#asset_file <- c("Sorted_100")
weight <- c("EW", "VW")
winso_1 <- c(0, 0.01, 0.02)
#winso_1 <- c(0.02)
pars_1 <- expand_grid(asset_file, weight, winso_1) |>
    filter(!(asset_file == "data_yahoo" & weight == "VW"))


reg_type <- c("static", "dynamic_24", "dynamic_60")
#reg_type <- c("static", "dynamic_24")
dates_all <- FF_factors$date
dates_all <- dates_all[dates_all > "1975-01-01"]
winso_2 <- c(0, 0.01, 0.02)
#winso_2 <- c(0.02)
pars_2 <- expand_grid(dates_all, reg_type, winso_2) |>
    mutate(last_day_month = ceiling_date(ymd(dates_all), 'month') - days(1)) |>
    filter(!(reg_type == "static" & (dates_all < max(dates_all)))) |>
    filter(dates_all > "1975-01-01") |>
    select(-last_day_month)

# tictoc::tic()
# output <- FM_full(
#         asset_file = "Sorted_25",
#         weight = "EW",
#         winso_1 = 0.01,
#         pars_2 = pars_2)
# tictoc::toc()
Show the code
library(furrr)
plan(multisession, workers = 5)             # Set the number of cores
tictoc::tic()
output <- future_pmap_dfr(pars_1, 
                          FM_full,
                          pars_2 = pars_2)    # Launch! 7 hours on 2019 iMac 
tictoc::toc()
save(output, file = "output.RData")

A snapshot at the shape of the results:

Show the code
library(tidyverse)
library(DescTools)
load("output.RData")
head(output)
date Constant Mkt_RF SMB HML RMW CMA aic sd sample_size sum_sq_err var_y freq reg_type winso_2 n_assets asset_file weight winso_1
1975-01-31 -0.0162435 0.1678035 0.1273486 0.0820008 -0.0043180 -0.0181090 -187.8139 0.0049026 25 0.0004567 0.0043537 monthly dynamic_24 0.00 25 Sorted_25 EW 0
1975-01-31 -0.0228837 0.1695414 0.1407183 0.0739570 0.0250056 -0.0104215 -145.5609 0.0114137 25 0.0024752 0.0043537 daily dynamic_24 0.00 25 Sorted_25 EW 0
1975-01-31 -0.0000225 0.1525194 0.1284641 0.0829089 -0.0016077 -0.0162033 -180.1899 0.0057101 25 0.0006195 0.0043537 monthly dynamic_24 0.01 25 Sorted_25 EW 0
1975-01-31 -0.0258171 0.1737020 0.1395263 0.0740132 0.0299035 -0.0099173 -145.4434 0.0114406 25 0.0024869 0.0043537 daily dynamic_24 0.01 25 Sorted_25 EW 0
1975-01-31 0.0101020 0.1445578 0.1264403 0.0851223 0.0053689 -0.0117975 -169.8246 0.0070255 25 0.0009378 0.0043537 monthly dynamic_24 0.02 25 Sorted_25 EW 0
1975-01-31 -0.0278045 0.1757247 0.1394500 0.0746526 0.0314142 -0.0097861 -144.4232 0.0116764 25 0.0025904 0.0043537 daily dynamic_24 0.02 25 Sorted_25 EW 0

Below, we plot the values of risk premia, averaged annually (and across paths). We provide 2 averaging methods (simple in orange and Bayesian in blue).
In addition, the yellow areas depict the inter-quartile range (across paths).
The Bayesian confidence intervals are indistinguishable from the blue lines.

Show the code
data_graph <- output |>
    mutate(date = as.Date(date),
           year = year(date)) |>
    pivot_longer(Constant:CMA, names_to = "factor", values_to = "premium") |>
    mutate(premium = premium * 12) |>
    filter(factor != "Constant", year > 1975, year < 2023) |>
    group_by(date, factor) |> 
    mutate(prem_winso = Winsorize(premium, probs = c(0.01, 0.99), na.rm = T),
           D = (aic - min(aic)),
           weight_freq = if_else(is.na(D), 0, exp(-D/2) / sum(exp(-D/2), na.rm = T)),
           b_freq = sum(weight_freq * premium, na.rm = T),
           lik = sqrt(sample_size/(sample_size+1))/((sum_sq_err+var_y)/(sqrt(sample_size)+1))^(sqrt(sample_size-1)),
           weight_bayes = lik/sum(lik, na.rm = T),
           b_bayes = sum(premium*weight_bayes, na.rm = T)
    ) |>
    group_by(date, year, factor) |>
    summarise(
        prem_winso = mean(prem_winso),
        b_freq = mean(b_freq),
        b_bayes = mean(b_bayes),
        s_freq = sum(weight_freq * sqrt(sd^2+(b_freq-premium)^2), na.rm = T),
        s_Bayesian = sum(weight_bayes * sqrt(sd^2+(b_bayes-premium)^2), na.rm = T),
        n = n(),
        b_up = b_bayes + 2.58*s_Bayesian/sqrt(n),
        b_down = b_bayes - 2.58*s_Bayesian/sqrt(n),
        prem_25 = quantile(premium, 0.25, na.rm = T),
        prem_75 = quantile(premium, 0.75, na.rm = T)
    ) |>
    group_by(year, factor) |>
    summarise(
        avg_premium = mean(prem_winso, na.rm = T),
        prem_25 = mean(prem_25),
        prem_75 = mean(prem_75),
        b_bayes = mean(b_bayes),
        b_up = mean(b_up),
        b_down = mean(b_down)
    )

data_graph |>
    ggplot(aes(x = year, y = avg_premium)) + 
    geom_ribbon(aes(ymin = prem_25, ymax = prem_75), fill = "#FFBE02", alpha = 0.36) + 
    geom_ribbon(aes(ymin = b_down, ymax = b_up), alpha = 0.7, fill = "#1863B8") + 
    geom_line(color = "#EC8C01", size = 0.6) + theme_bw() + 
    geom_line(aes(y = b_bayes), color = "#255081", size = 0.45) + 
    theme(legend.position = c(0.75,0.15),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          strip.text = element_text(face = "bold"),
          strip.background = element_rect(fill = "#E1EFFD", color = "white"),
          legend.title = element_text(face = "bold"),
          axis.title = element_blank()) +
    facet_wrap(vars(factor), ncol = 2, scales = "free") +
    guides(fill=guide_legend(nrow = 2, byrow = TRUE)) 

Show the code
ggsave("FM.pdf", width = 8, height = 6)

Below, we show the frequentist average.
The narrow grey area around the line is the path-driven confidence interval (1% level).

Show the code
output |>
    mutate(date = as.Date(date),
           year = year(date)) |>
    pivot_longer(Constant:CMA, names_to = "factor", values_to = "premium") |>
    filter(factor != "Constant") |>
    group_by(date, factor) |>
    mutate(D = (aic - min(aic)),
           year = year(date),
           weight_freq = if_else(is.na(D), 0, exp(-D/2) / sum(exp(-D/2), na.rm = T)),
           b_freq = sum(weight_freq * premium, na.rm = T)) |>
    group_by(year, factor) |>
    summarise(s_freq = sum(weight_freq / 12 * sqrt(sd^2+(b_freq-premium)^2), na.rm = T),
              n = n(),
              b_freq = mean(b_freq),
              b_up = b_freq + 2.58*s_freq/sqrt(n),
              b_down = b_freq - 2.58*s_freq/sqrt(n)) |>
    ggplot(aes(x = year, y = b_freq)) + 
    geom_ribbon(aes(ymin = b_down, ymax = b_up), color = "grey", alpha = 0.4) + 
    geom_line(linewidth = 0.4) + 
    facet_wrap(vars(factor), ncol = 2) + theme_bw() +
    theme(legend.position = c(0.75,0.15),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          strip.text = element_text(face = "bold"),
          strip.background = element_rect(fill = "#E1EFFD", color = "white"),
          legend.title = element_text(face = "bold"),
          axis.title = element_blank()) +
    ylim(-0.05,0.1)

5 Subsample analysis

We take a look a the two chronological halves of the samples and compare the average premia.

Show the code
library(ggrepel)
sep_date <- "1998-06-15"
output |>
    mutate(period = if_else(date < sep_date, "first_period", "second_period")) |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    group_by(period, factor, reg_type) |>
    summarise(r = mean(value)) |>
    pivot_wider(names_from = period, values_from = r) |>
    ggplot(aes(x = first_period, y = second_period)) + geom_smooth(method = "lm", se = F, color = "grey") +
    #geom_smooth(method = "lm", se = F, aes(color = reg_type)) + 
    geom_point(aes(color = reg_type)) +
    theme_classic() + geom_vline(xintercept = 0, linewidth = 0.3) + geom_hline(yintercept = 0, linewidth = 0.3) + 
    geom_text_repel(aes(label = factor)) +
    xlab("first period") + ylab("second period") +
    theme(axis.title = element_text(face = "bold"),
          legend.position = c(0.92,0.8),
          legend.title = element_text(face = "bold")) + 
    scale_color_manual(values = c("#E62D70", "#5485E7", "#F8C509"),
                       name = "regression\n    type",
                       label = c("dyn. short", "dyn. long", "static"))

Show the code
ggsave("FM_comparison.pdf", width = 8, height = 4)    

Then the alternatives for the first pass.

Show the code
output |>
    mutate(asset_file = case_when(asset_file == "data_yahoo" ~ "single stocks", .default = asset_file)) |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    group_by(freq, reg_type, winso_1, winso_2, asset_file, weight, factor) |>
    summarise(value = mean(value)) |>
    pivot_wider(names_from = reg_type, values_from = value) |>
    mutate(asset_file = asset_file |> str_replace("_", " ") |> tolower()) |>
    ggplot(aes(x = static, y = dynamic_60)) + ylab("dynamic first pass (long samples)") +
    geom_vline(xintercept = 0, linewidth = 0.4, color = "grey") + geom_hline(yintercept = 0, linewidth = 0.4, color = "grey") +
    geom_point(size = 0.9, aes(color = asset_file)) + theme_classic() + 
    geom_smooth(se = F, method = "lm", color = "black", linewidth = 0.5, linetype = 2) + 
    scale_color_viridis_d(name = "assets") + xlab("static first pass") + 
    theme(legend.position = c(0.85,0.25),
          axis.title = element_text(face = "bold"),
          legend.title = element_text(face = "bold")) 

Show the code
ggsave("FM_comparison.pdf", width = 8, height = 4)    

Below we look at 3 layers with 3 options and see if they have an impact on the average premium.
First, regression type, then the first winsorization threshold and finally the second one.
The first 3 columns are the average premia and the last three compute 1-2, 1-3 and 2-3 with the significance levels (simple t-tests).

Show the code
starz <- function(p){
    if(p < 0.001){"(***)"} else {
        if(p < 0.01){"(**)"} else{
            if(p<0.05){"(*)"}
        }
    }
}
t1 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = reg_type, values_from = value) |>
    unnest(cols = c(dynamic_24, dynamic_60, static)) |>
    group_by(factor) |>
    summarise(m1 = (mean(dynamic_24, na.rm = T)*100) |> round(3),
              m2 = (mean(dynamic_60, na.rm = T)*100) |> round(3),
              m3 = (mean(static, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(dynamic_24, dynamic_60)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(dynamic_24, static)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(dynamic_60, static)$p.value)))

t2 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = winso_1, values_from = value) |>
    unnest(cols = c(`0`, `0.01`, `0.02`)) |>
    group_by(factor) |>
    summarise(m1 = (mean(`0`, na.rm = T)*100) |> round(3),
              m2 = (mean(`0.01`, na.rm = T)*100) |> round(3),
              m3 = (mean(`0.02`, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(`0`, `0.01`)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(`0`, `0.02`)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(`0.01`, `0.02`)$p.value)))

t3 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = winso_2, values_from = value) |>
    unnest(cols = c(`0`, `0.01`, `0.02`)) |>
    group_by(factor) |>
    summarise(m1 = (mean(`0`, na.rm = T)*100) |> round(3),
              m2 = (mean(`0.01`, na.rm = T)*100) |> round(3),
              m3 = (mean(`0.02`, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(`0`, `0.01`)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(`0`, `0.02`)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(`0.01`, `0.02`)$p.value)))
t1
factor m1 m2 m3 t1 t2 t3
CMA 0.149 0.187 0.264 -0.038 (*) -0.115 (***) -0.077 (**)
HML 0.091 0.110 -0.115 -0.019 0.206 (***) 0.225 (***)
Mkt_RF 0.141 0.147 0.267 -0.006 -0.126 (**) -0.12 (**)
RMW 0.135 0.173 0.418 -0.038 (*) -0.283 (***) -0.245 (***)
SMB 0.180 0.283 0.068 -0.103 (***) 0.112 (***) 0.215 (***)
Show the code
t2
factor m1 m2 m3 t1 t2 t3
CMA 0.430 0.107 0.070 0.323 (***) 0.36 (***) 0.037
HML -0.033 0.038 0.066 -0.071 (**) -0.099 (***) -0.028
Mkt_RF 0.074 0.189 0.298 -0.115 (**) -0.224 (***) -0.109 (*)
RMW 0.194 0.267 0.283 -0.073 (**) -0.089 (***) -0.016
SMB 0.202 0.146 0.173 0.056 (*) 0.029 -0.027
Show the code
t3
factor m1 m2 m3 t1 t2 t3
CMA 0.210 0.204 0.193 0.006 0.017 0.011
HML 0.021 0.023 0.027 -0.002 -0.006 -0.004
Mkt_RF 0.150 0.194 0.223 -0.044 -0.073 -0.029
RMW 0.235 0.253 0.256 -0.018 -0.021 -0.003
SMB 0.164 0.177 0.180 -0.013 -0.016 -0.003
Show the code
library(xtable)
t1 |> bind_rows(t2) |> bind_rows(t3) |> xtable(digits = 3) |> print(include.rownames=FALSE)

6 Comparison with prior work

Fama-MacBeth long term premium: 0.0085. It is shown with the vertical dashed line below.
The other values originate from the paper Using Stocks or Portfolios in Tests of Factor Models.

The values in the rounded rectangles are the “ease-to-replicate” indicator (90% level) defined in the paper.

Show the code
library(latex2exp)
quant <- 0.9

vec_prior <- c(0.0085, 0.0114, 0.0158, 0.0173, 0.0479) # Values from prior studies

dist_FM <- output |>
    group_by(freq, reg_type, winso_1, winso_2, weight, asset_file) |>
    summarise(gamma_1 = mean(Mkt_RF)) 

# Odds of favorable outcome below
ofo <- pnorm(vec_prior, mean = mean(dist_FM$gamma_1), sd = sd(dist_FM$gamma_1))

dist_FM |>
    ggplot(aes(x  = gamma_1)) + 
    theme_bw() + 
    theme(panel.grid = element_blank(),
          axis.title.y = element_blank()) +
    geom_vline(xintercept = 0.0085, linetype = 2, linewidth = 0.8) +
    geom_vline(xintercept = 0.0114) +
    geom_vline(xintercept = 0.0158) + 
    geom_vline(xintercept = 0.0173) + 
    geom_vline(xintercept = 0.0479) +
    geom_histogram(fill = "#999999") + 
    xlab(TeX("$\\bar{\\gamma}_1$")) +
    geom_label(aes(x = vec_prior[1], y = 103, label = paste0(round((1-if_else(ofo[1]>quant, (ofo[1]-quant)/(1-quant), 0))*100),"%")), fill = "#A4D5AA") +
    geom_label(aes(x = vec_prior[2], y = 96, label = paste0(round((1-if_else(ofo[2]>quant, (ofo[2]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[3], y = 103, label = paste0(round((1-if_else(ofo[3]>quant, (ofo[3]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[4], y = 96, label = paste0(round((1-if_else(ofo[4]>quant, (ofo[4]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[5], y = 103, label = paste0(round((1-if_else(ofo[5]>quant, (ofo[5]-quant)/(1-quant), 0))*100),"%")), fill = "#FF4E4E") 

Show the code
# ggsave("FM_prior.pdf", width = 8, height = 4)

Lastly, an illustration of the impact on the assets on the magnitude of premia.

Show the code
output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    mutate(asset_file = case_when(asset_file == "data_yahoo" ~ "single stocks", .default = asset_file)) |>
    mutate(asset_file = asset_file |> str_replace("_", " ") |> tolower()) |>
    group_by(factor, asset_file) |>
    summarise(abs = mean(abs(value))) |>
    ggplot(aes(x = abs, y = reorder(asset_file, abs), fill = factor)) +
    geom_col(position = "dodge", alpha = 0.7) + theme_classic() +
    theme(axis.title.y = element_blank(),
          axis.title.x = element_text(face = "bold"),
          legend.title = element_text(face = "bold"),
          legend.position = c(0.8,0.5)) +
    xlab("Average absolute premia") +
    scale_fill_brewer(palette = "Spectral")

Show the code
ggsave("FM_assets.pdf", width = 8, height = 4)