The many shades of Fama-MacBeth

1 Introduction

This notebook generates the results of the third study in the paper Forking paths in empirical studies available on SSRN.

The code is folded to ease readability. Click on the corresponding buttons to access it.

2 Data & packages

First we load the few packages we will need.
Then, we download the data locally. We do this to speed up computation times later on so that each call does not need to fetch the data all over again.
While we could build a large tidy dataset with all portfolios returns (across dates, assets and asset types), we rely on a brute force approach that stores samples separately. This in fact save some time, as we will not have to filter and pivot the data to a wide format: these steps, when repeated many fold, are time-consuming.

NOTE: the data from Ken French’s website is subject to updates and potential instability, see Noisy Factors.

The data fetching script (FM_data.qmd) is available upon request.

Show the code

library(tidyverse)
library(quantmod)
library(DescTools)
# source(knitr::purl("FM_data.qmd", quiet=TRUE))

load("FF_factors_monthly.RData")

winso_mat <- function(m, q){
    q_min <- quantile(m[,], q, na.rm = T)
    q_max <- quantile(m[,], 1-q, na.rm = T)
    m[m < q_min] <- q_min
    m[m > q_max] <- q_max
    m
}

3 The functions

Show the code

fetch_asset <- function(freq, asset_file, weight, winso_1){
    load(paste0(asset_file, "_", freq, ".RData"))
    load(paste0("FF_factors_", freq, ".RData"))
    assets <- assets |> filter(weighting == weight) 
    assets <- bind_cols(assets |> select(date, weighting),                 # Winsorising stage
                        assets |> 
                            select(-date, -weighting) |> 
                            data.matrix() |> winso_mat(winso_1) |>
                            data.frame())
    left_join(assets, FF_factors |> select(-RF), by = "date") |>   # Joining stage
        filter(is.finite(SMB))    
}

pass_1 <- function(data, factors){                               # This function performs the first pass
    nb_factors <- length(factors)           
    asset_names <- colnames(data |> select(-all_of(c("date", "weighting", factors))))   # Keep names of assets
    models <- lapply(paste(asset_names, ' ~  Mkt_RF + SMB + HML + RMW + CMA'),  # Specification of models
                     function(f){lm(as.formula(f), data = data) %>%             # Calling lm(.)
                             summary() %>%                                      # Gathering the output
                             "$"(coef) %>%                                      # Keeping only the coefs
                             data.frame() %>%                                   # Convert to dataframe
                             select(Estimate)}                                  # Keep the coefficients
    )
    
    betas <- matrix(unlist(models), ncol = nb_factors + 1, byrow = T) %>%       # Extracting the betas
        data.frame(row.names = asset_names)                                     # Formatting: row names
    colnames(betas) <- c("Constant", "Mkt_RF", "SMB", "HML", "RMW", "CMA")      # Formatting: col names
    betas   
}

pass_2 <- function(FM_data, reg_type){
    factors <- c("Mkt_RF", "SMB", "HML", "RMW", "CMA")                        # Fixed for the whole study
    nb_factors <- length(factors)
    dates <- FM_data |> select(-all_of(factors)) |> colnames()
    if(reg_type != "static"){
        dates <- max(dates)
    }
    models <- lapply(paste("`", dates, "`", ' ~  Mkt_RF + SMB + HML + RMW + CMA', sep = ""),
                     function(f){ 
                         temp <- lm(as.formula(f), data = FM_data, y = T)   # Calling lm(.)
                         temp |> summary() %>%                              # Gathering the output
                             "$"(coef) %>%  data.frame() %>%                # Convert coefs to dataframe
                             select(Estimate) |>
                             bind_rows(tibble(Estimate = AIC(temp)),        # Add AIC & sd below
                                       tibble(Estimate = summary(temp)$sigma),
                                       tibble(Estimate = nrow(FM_data)),
                                       tibble(Estimate = sum(temp$residuals^2)),
                                       tibble(Estimate = var(temp$y)))
                     }                                  
    )
    gammas <- matrix(unlist(models), ncol = nb_factors + 6, byrow = T) %>%  # Switch to dataframe
        data.frame(row.names = dates)                                       # & set row names
    colnames(gammas) <- c("Constant", "Mkt_RF", "SMB", "HML", "RMW", "CMA", 
                          "aic", "sd", "sample_size", "sum_sq_err", "var_y") # Set col names
    gammas
}

4 The modules

Show the code

module_data <- function(asset_file, weight, winso_1){
    assets_monthly <- fetch_asset("monthly", asset_file, weight, winso_1)
    assets_daily <- fetch_asset("daily", asset_file, weight, winso_1)
    list(assets_monthly = assets_monthly,
         assets_daily = assets_daily)
}

module_FM <- function(dates_all, reg_type, winso_2, data){
    data_monthly <- data$assets_monthly
    data_daily <- data$assets_daily
    factors <- c("Mkt_RF", "SMB", "HML", "RMW", "CMA")              # Fixed for the whole study
    if(reg_type != "static"){                                       # Sample size for rolling regressions
        sample_size <- substr(reg_type, nchar(reg_type)-1, nchar(reg_type)) |> as.numeric()
        data_monthly <- data_monthly |> filter(date <= dates_all) |> tail(sample_size)
        data_daily <- data_daily |> filter(date <= dates_all) |> tail(sample_size*5)
    }
    data_monthly <- data_monthly[, colMeans(is.na(data_monthly)) == 0]     # Keep only assets with all observations
    data_daily <- data_daily[, colMeans(is.na(data_daily)) == 0]           # Keep only assets with all observations
    # Below we remove numeric columns with no variations
    data_monthly <- data_monthly[,c(T,T, apply(data_monthly[3:ncol(data_monthly)], 2, sd) > 0.001)]
    data_daily <- data_daily[,c(T,T, apply(data_daily[3:ncol(data_daily)], 2, sd, ) > 0.001)]
    asset_names_m <- data_monthly |> select(-all_of(c("date", "weighting", factors))) |> colnames()
    asset_names_d <- data_daily |> select(-all_of(c("date", "weighting", factors))) |> colnames()
    asset_names <- intersect(asset_names_d, asset_names_m)                 # Same names for daily & monthly
    data_daily <- data_daily |> select(all_of(c("date", "weighting", factors, asset_names)))
    data_monthly <- data_monthly |> select(all_of(c("date", "weighting", factors, asset_names)))
    betas_monthly <- pass_1(data_monthly, factors) |> data.matrix() |>     # First pass      
        winso_mat(winso_2) |> data.frame()                                 # Winsorize betas
    betas_daily <- pass_1(data_daily, factors) |> data.matrix() |>         # First pass      
        winso_mat(winso_2) |> data.frame()                                 # Winsorize betas
    loadings_m <- betas_monthly |> select(-Constant)                       # Keep only betas 
    loadings_d <- betas_daily |> select(-Constant)                         # Keep only betas 
    returns_m <- data_monthly |>                                           # Start from returns
        select(all_of(asset_names)) |>                                     # Keep the returns only
        data.frame(row.names = data_monthly$date) |>                       # Set row names
        t()
    returns_m <- returns_m[,apply(returns_m, 2, sd, ) > 0.001]             # Remove constant cols (data glitch)
    FM_data_m <- cbind(loadings_m, returns_m)
    FM_data_d <- cbind(loadings_d, returns_m)
    bind_rows(
        pass_2(FM_data_m, reg_type) |>                                   # Second pass
            rownames_to_column("date") |> mutate(freq = "monthly"), 
        pass_2(FM_data_d, reg_type) |>                                   #  Second pass, on monthly returns!
            rownames_to_column("date") |> mutate(freq = "daily")
    ) |>
        mutate(reg_type = reg_type,
               winso_2 = winso_2,
               n_assets = length(asset_names))
}

FM_full <- function(asset_file = "data_yahoo",
                    weight = "EW",
                    winso_1 = 0.01,
                    pars_2 = pars_2
){
    print(asset_file)
    data <- module_data(
        asset_file = asset_file,
        weight = weight,
        winso_1 = winso_1
    ) 
    pmap_dfr(
        list(dates_all = pars_2$dates_all,
             reg_type = pars_2$reg_type,
             winso_2 = pars_2$winso_2), 
        module_FM,
        data = data
    ) |>
        mutate(asset_file = asset_file,
               weight = weight,
               winso_1 = winso_1)
}

module_data(
    asset_file = "Sorted_25",
    weight = "EW",
    winso_1 = 0.02
) %>% # OLD PIPE !
    module_FM(
        dates_all = "1987-10-31" |> as.Date(),
        reg_type = "dynamic_24",
        winso_2 = 0.02,
        data = .)

date	Constant	Mkt_RF	SMB	HML	RMW	CMA	aic	sd	sample_size	sum_sq_err	var_y	freq	reg_type	winso_2	n_assets
1987-09-30	0.0019265	-0.0159170	0.0132091	0.0078578	-0.0152435	-0.0065291	-157.5279	0.0089843	25	0.0015336	0.000127	monthly	dynamic_24	0.02	25
1987-09-30	-0.0039317	0.0740385	-0.0098788	-0.0159272	-0.0407886	-0.0007061	-155.5813	0.0093410	25	0.0016578	0.000127	daily	dynamic_24	0.02	25

Next, we span all path configurations.

Show the code

asset_file <- c("Sorted_25", "Sorted_100", "Industry_12", "Industry_48", "data_yahoo")
#asset_file <- c("Sorted_100")
weight <- c("EW", "VW")
winso_1 <- c(0, 0.01, 0.02)
#winso_1 <- c(0.02)
pars_1 <- expand_grid(asset_file, weight, winso_1) |>
    filter(!(asset_file == "data_yahoo" & weight == "VW"))


reg_type <- c("static", "dynamic_24", "dynamic_60")
#reg_type <- c("static", "dynamic_24")
dates_all <- FF_factors$date
dates_all <- dates_all[dates_all > "1975-01-01"]
winso_2 <- c(0, 0.01, 0.02)
#winso_2 <- c(0.02)
pars_2 <- expand_grid(dates_all, reg_type, winso_2) |>
    mutate(last_day_month = ceiling_date(ymd(dates_all), 'month') - days(1)) |>
    filter(!(reg_type == "static" & (dates_all < max(dates_all)))) |>
    filter(dates_all > "1975-01-01") |>
    select(-last_day_month)

# tictoc::tic()
# output <- FM_full(
#         asset_file = "Sorted_25",
#         weight = "EW",
#         winso_1 = 0.01,
#         pars_2 = pars_2)
# tictoc::toc()

Show the code

library(furrr)
plan(multisession, workers = 5)             # Set the number of cores
tictoc::tic()
output <- future_pmap_dfr(pars_1, 
                          FM_full,
                          pars_2 = pars_2)    # Launch! 7 hours on 2019 iMac 
tictoc::toc()
save(output, file = "output.RData")

A snapshot at the shape of the results:

Show the code

library(tidyverse)
library(DescTools)
load("output.RData")
head(output)

date	Constant	Mkt_RF	SMB	HML	RMW	CMA	aic	sd	sample_size	sum_sq_err	var_y	freq	reg_type	winso_2	n_assets	asset_file	weight
1975-01-31	-0.0162435	0.1678035	0.1273486	0.0820008	-0.0043180	-0.0181090	-187.8139	0.0049026	25	0.0004567	0.0043537	monthly	dynamic_24	0.00	25	Sorted_25	EW
1975-01-31	-0.0228837	0.1695414	0.1407183	0.0739570	0.0250056	-0.0104215	-145.5609	0.0114137	25	0.0024752	0.0043537	daily	dynamic_24	0.00	25	Sorted_25	EW
1975-01-31	-0.0000225	0.1525194	0.1284641	0.0829089	-0.0016077	-0.0162033	-180.1899	0.0057101	25	0.0006195	0.0043537	monthly	dynamic_24	0.01	25	Sorted_25	EW
1975-01-31	-0.0258171	0.1737020	0.1395263	0.0740132	0.0299035	-0.0099173	-145.4434	0.0114406	25	0.0024869	0.0043537	daily	dynamic_24	0.01	25	Sorted_25	EW
1975-01-31	0.0101020	0.1445578	0.1264403	0.0851223	0.0053689	-0.0117975	-169.8246	0.0070255	25	0.0009378	0.0043537	monthly	dynamic_24	0.02	25	Sorted_25	EW
1975-01-31	-0.0278045	0.1757247	0.1394500	0.0746526	0.0314142	-0.0097861	-144.4232	0.0116764	25	0.0025904	0.0043537	daily	dynamic_24	0.02	25	Sorted_25	EW

Below, we plot the values of risk premia, averaged annually (and across paths). We provide 2 averaging methods (simple in orange and Bayesian in blue).
In addition, the yellow areas depict the inter-quartile range (across paths).
The Bayesian confidence intervals are indistinguishable from the blue lines.

Show the code

data_graph <- output |>
    mutate(date = as.Date(date),
           year = year(date)) |>
    pivot_longer(Constant:CMA, names_to = "factor", values_to = "premium") |>
    mutate(premium = premium * 12) |>
    filter(factor != "Constant", year > 1975, year < 2023) |>
    group_by(date, factor) |> 
    mutate(prem_winso = Winsorize(premium, probs = c(0.01, 0.99), na.rm = T),
           D = (aic - min(aic)),
           weight_freq = if_else(is.na(D), 0, exp(-D/2) / sum(exp(-D/2), na.rm = T)),
           b_freq = sum(weight_freq * premium, na.rm = T),
           lik = sqrt(sample_size/(sample_size+1))/((sum_sq_err+var_y)/(sqrt(sample_size)+1))^(sqrt(sample_size-1)),
           weight_bayes = lik/sum(lik, na.rm = T),
           b_bayes = sum(premium*weight_bayes, na.rm = T)
    ) |>
    group_by(date, year, factor) |>
    summarise(
        prem_winso = mean(prem_winso),
        b_freq = mean(b_freq),
        b_bayes = mean(b_bayes),
        s_freq = sum(weight_freq * sqrt(sd^2+(b_freq-premium)^2), na.rm = T),
        s_Bayesian = sum(weight_bayes * sqrt(sd^2+(b_bayes-premium)^2), na.rm = T),
        n = n(),
        b_up = b_bayes + 2.58*s_Bayesian/sqrt(n),
        b_down = b_bayes - 2.58*s_Bayesian/sqrt(n),
        prem_25 = quantile(premium, 0.25, na.rm = T),
        prem_75 = quantile(premium, 0.75, na.rm = T)
    ) |>
    group_by(year, factor) |>
    summarise(
        avg_premium = mean(prem_winso, na.rm = T),
        prem_25 = mean(prem_25),
        prem_75 = mean(prem_75),
        b_bayes = mean(b_bayes),
        b_up = mean(b_up),
        b_down = mean(b_down)
    )

data_graph |>
    ggplot(aes(x = year, y = avg_premium)) + 
    geom_ribbon(aes(ymin = prem_25, ymax = prem_75), fill = "#FFBE02", alpha = 0.36) + 
    geom_ribbon(aes(ymin = b_down, ymax = b_up), alpha = 0.7, fill = "#1863B8") + 
    geom_line(color = "#EC8C01", size = 0.6) + theme_bw() + 
    geom_line(aes(y = b_bayes), color = "#255081", size = 0.45) + 
    theme(legend.position = c(0.75,0.15),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          strip.text = element_text(face = "bold"),
          strip.background = element_rect(fill = "#E1EFFD", color = "white"),
          legend.title = element_text(face = "bold"),
          axis.title = element_blank()) +
    facet_wrap(vars(factor), ncol = 2, scales = "free") +
    guides(fill=guide_legend(nrow = 2, byrow = TRUE))

Show the code

ggsave("FM.pdf", width = 8, height = 6)

Below, we show the frequentist average.
The narrow grey area around the line is the path-driven confidence interval (1% level).

Show the code

output |>
    mutate(date = as.Date(date),
           year = year(date)) |>
    pivot_longer(Constant:CMA, names_to = "factor", values_to = "premium") |>
    filter(factor != "Constant") |>
    group_by(date, factor) |>
    mutate(D = (aic - min(aic)),
           year = year(date),
           weight_freq = if_else(is.na(D), 0, exp(-D/2) / sum(exp(-D/2), na.rm = T)),
           b_freq = sum(weight_freq * premium, na.rm = T)) |>
    group_by(year, factor) |>
    summarise(s_freq = sum(weight_freq / 12 * sqrt(sd^2+(b_freq-premium)^2), na.rm = T),
              n = n(),
              b_freq = mean(b_freq),
              b_up = b_freq + 2.58*s_freq/sqrt(n),
              b_down = b_freq - 2.58*s_freq/sqrt(n)) |>
    ggplot(aes(x = year, y = b_freq)) + 
    geom_ribbon(aes(ymin = b_down, ymax = b_up), color = "grey", alpha = 0.4) + 
    geom_line(linewidth = 0.4) + 
    facet_wrap(vars(factor), ncol = 2) + theme_bw() +
    theme(legend.position = c(0.75,0.15),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          strip.text = element_text(face = "bold"),
          strip.background = element_rect(fill = "#E1EFFD", color = "white"),
          legend.title = element_text(face = "bold"),
          axis.title = element_blank()) +
    ylim(-0.05,0.1)

5 Subsample analysis

We take a look a the two chronological halves of the samples and compare the average premia.

Show the code

library(ggrepel)
sep_date <- "1998-06-15"
output |>
    mutate(period = if_else(date < sep_date, "first_period", "second_period")) |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    group_by(period, factor, reg_type) |>
    summarise(r = mean(value)) |>
    pivot_wider(names_from = period, values_from = r) |>
    ggplot(aes(x = first_period, y = second_period)) + geom_smooth(method = "lm", se = F, color = "grey") +
    #geom_smooth(method = "lm", se = F, aes(color = reg_type)) + 
    geom_point(aes(color = reg_type)) +
    theme_classic() + geom_vline(xintercept = 0, linewidth = 0.3) + geom_hline(yintercept = 0, linewidth = 0.3) + 
    geom_text_repel(aes(label = factor)) +
    xlab("first period") + ylab("second period") +
    theme(axis.title = element_text(face = "bold"),
          legend.position = c(0.92,0.8),
          legend.title = element_text(face = "bold")) + 
    scale_color_manual(values = c("#E62D70", "#5485E7", "#F8C509"),
                       name = "regression\n    type",
                       label = c("dyn. short", "dyn. long", "static"))

Show the code

ggsave("FM_comparison.pdf", width = 8, height = 4)

Then the alternatives for the first pass.

Show the code

output |>
    mutate(asset_file = case_when(asset_file == "data_yahoo" ~ "single stocks", .default = asset_file)) |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    group_by(freq, reg_type, winso_1, winso_2, asset_file, weight, factor) |>
    summarise(value = mean(value)) |>
    pivot_wider(names_from = reg_type, values_from = value) |>
    mutate(asset_file = asset_file |> str_replace("_", " ") |> tolower()) |>
    ggplot(aes(x = static, y = dynamic_60)) + ylab("dynamic first pass (long samples)") +
    geom_vline(xintercept = 0, linewidth = 0.4, color = "grey") + geom_hline(yintercept = 0, linewidth = 0.4, color = "grey") +
    geom_point(size = 0.9, aes(color = asset_file)) + theme_classic() + 
    geom_smooth(se = F, method = "lm", color = "black", linewidth = 0.5, linetype = 2) + 
    scale_color_viridis_d(name = "assets") + xlab("static first pass") + 
    theme(legend.position = c(0.85,0.25),
          axis.title = element_text(face = "bold"),
          legend.title = element_text(face = "bold"))

Show the code

ggsave("FM_comparison.pdf", width = 8, height = 4)

Below we look at 3 layers with 3 options and see if they have an impact on the average premium.
First, regression type, then the first winsorization threshold and finally the second one.
The first 3 columns are the average premia and the last three compute 1-2, 1-3 and 2-3 with the significance levels (simple t-tests).

Show the code

starz <- function(p){
    if(p < 0.001){"(***)"} else {
        if(p < 0.01){"(**)"} else{
            if(p<0.05){"(*)"}
        }
    }
}
t1 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = reg_type, values_from = value) |>
    unnest(cols = c(dynamic_24, dynamic_60, static)) |>
    group_by(factor) |>
    summarise(m1 = (mean(dynamic_24, na.rm = T)*100) |> round(3),
              m2 = (mean(dynamic_60, na.rm = T)*100) |> round(3),
              m3 = (mean(static, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(dynamic_24, dynamic_60)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(dynamic_24, static)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(dynamic_60, static)$p.value)))

t2 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = winso_1, values_from = value) |>
    unnest(cols = c(`0`, `0.01`, `0.02`)) |>
    group_by(factor) |>
    summarise(m1 = (mean(`0`, na.rm = T)*100) |> round(3),
              m2 = (mean(`0.01`, na.rm = T)*100) |> round(3),
              m3 = (mean(`0.02`, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(`0`, `0.01`)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(`0`, `0.02`)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(`0.01`, `0.02`)$p.value)))

t3 <- output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    select(date, freq, reg_type, winso_1, winso_2, asset_file, weight, factor, value) |>
    pivot_wider(names_from = winso_2, values_from = value) |>
    unnest(cols = c(`0`, `0.01`, `0.02`)) |>
    group_by(factor) |>
    summarise(m1 = (mean(`0`, na.rm = T)*100) |> round(3),
              m2 = (mean(`0.01`, na.rm = T)*100) |> round(3),
              m3 = (mean(`0.02`, na.rm = T)*100) |> round(3),
              t1 = paste(round(m1-m2,3), starz(t.test(`0`, `0.01`)$p.value)),
              t2 = paste(round(m1-m3,3), starz(t.test(`0`, `0.02`)$p.value)),
              t3 = paste(round(m2-m3,3), starz(t.test(`0.01`, `0.02`)$p.value)))
t1

factor	m1	m2	m3	t1	t2	t3
CMA	0.149	0.187	0.264	-0.038 (*)	-0.115 (***)	-0.077 (**)
HML	0.091	0.110	-0.115	-0.019	0.206 (***)	0.225 (***)
Mkt_RF	0.141	0.147	0.267	-0.006	-0.126 (**)	-0.12 (**)
RMW	0.135	0.173	0.418	-0.038 (*)	-0.283 (***)	-0.245 (***)
SMB	0.180	0.283	0.068	-0.103 (***)	0.112 (***)	0.215 (***)

Show the code

t2

factor	m1	m2	m3	t1	t2	t3
CMA	0.430	0.107	0.070	0.323 (***)	0.36 (***)	0.037
HML	-0.033	0.038	0.066	-0.071 (**)	-0.099 (***)	-0.028
Mkt_RF	0.074	0.189	0.298	-0.115 (**)	-0.224 (***)	-0.109 (*)
RMW	0.194	0.267	0.283	-0.073 (**)	-0.089 (***)	-0.016
SMB	0.202	0.146	0.173	0.056 (*)	0.029	-0.027

Show the code

t3

factor	m1	m2	m3	t1	t2	t3
CMA	0.210	0.204	0.193	0.006	0.017	0.011
HML	0.021	0.023	0.027	-0.002	-0.006	-0.004
Mkt_RF	0.150	0.194	0.223	-0.044	-0.073	-0.029
RMW	0.235	0.253	0.256	-0.018	-0.021	-0.003
SMB	0.164	0.177	0.180	-0.013	-0.016	-0.003

Show the code

library(xtable)
t1 |> bind_rows(t2) |> bind_rows(t3) |> xtable(digits = 3) |> print(include.rownames=FALSE)

6 Comparison with prior work

Fama-MacBeth long term premium: 0.0085. It is shown with the vertical dashed line below.
The other values originate from the paper Using Stocks or Portfolios in Tests of Factor Models.

The values in the rounded rectangles are the “ease-to-replicate” indicator (90% level) defined in the paper.

Show the code

library(latex2exp)
quant <- 0.9

vec_prior <- c(0.0085, 0.0114, 0.0158, 0.0173, 0.0479) # Values from prior studies

dist_FM <- output |>
    group_by(freq, reg_type, winso_1, winso_2, weight, asset_file) |>
    summarise(gamma_1 = mean(Mkt_RF)) 

# Odds of favorable outcome below
ofo <- pnorm(vec_prior, mean = mean(dist_FM$gamma_1), sd = sd(dist_FM$gamma_1))

dist_FM |>
    ggplot(aes(x  = gamma_1)) + 
    theme_bw() + 
    theme(panel.grid = element_blank(),
          axis.title.y = element_blank()) +
    geom_vline(xintercept = 0.0085, linetype = 2, linewidth = 0.8) +
    geom_vline(xintercept = 0.0114) +
    geom_vline(xintercept = 0.0158) + 
    geom_vline(xintercept = 0.0173) + 
    geom_vline(xintercept = 0.0479) +
    geom_histogram(fill = "#999999") + 
    xlab(TeX("$\\bar{\\gamma}_1$")) +
    geom_label(aes(x = vec_prior[1], y = 103, label = paste0(round((1-if_else(ofo[1]>quant, (ofo[1]-quant)/(1-quant), 0))*100),"%")), fill = "#A4D5AA") +
    geom_label(aes(x = vec_prior[2], y = 96, label = paste0(round((1-if_else(ofo[2]>quant, (ofo[2]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[3], y = 103, label = paste0(round((1-if_else(ofo[3]>quant, (ofo[3]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[4], y = 96, label = paste0(round((1-if_else(ofo[4]>quant, (ofo[4]-quant)/(1-quant), 0))*100),"%")), fill = "grey") +
    geom_label(aes(x = vec_prior[5], y = 103, label = paste0(round((1-if_else(ofo[5]>quant, (ofo[5]-quant)/(1-quant), 0))*100),"%")), fill = "#FF4E4E")

Show the code

# ggsave("FM_prior.pdf", width = 8, height = 4)

Lastly, an illustration of the impact on the assets on the magnitude of premia.

Show the code

output |>
    pivot_longer(Mkt_RF:CMA, values_to = "value", names_to = "factor") |>
    mutate(asset_file = case_when(asset_file == "data_yahoo" ~ "single stocks", .default = asset_file)) |>
    mutate(asset_file = asset_file |> str_replace("_", " ") |> tolower()) |>
    group_by(factor, asset_file) |>
    summarise(abs = mean(abs(value))) |>
    ggplot(aes(x = abs, y = reorder(asset_file, abs), fill = factor)) +
    geom_col(position = "dodge", alpha = 0.7) + theme_classic() +
    theme(axis.title.y = element_blank(),
          axis.title.x = element_text(face = "bold"),
          legend.title = element_text(face = "bold"),
          legend.position = c(0.8,0.5)) +
    xlab("Average absolute premia") +
    scale_fill_brewer(palette = "Spectral")

Show the code

ggsave("FM_assets.pdf", width = 8, height = 4)