This notebook generates the results of the second study in the paper "Forking paths in empirical studies", available on SSRN.
The dataset for this part comes from Dacheng Xiu's website (https://dachxiu.chicagobooth.edu/) and was slightly altered. The file is fairly large and can be downloaded here.
WARNING: the code for this part takes much longer to run, roughly 11 hours on a 2018 MacBook Pro.
The code is also folded to ease readability. Click on
the corresponding buttons to access it.
First, we load the packages, import the data, and take a brief look.
options(future.globals.maxSize= 891289600*10)
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L) # Allows viewing up to 200 columns in the data viewer
## NULL
library(tidyverse) # Data wrangling package
library(readxl) # Read excel files
library(data.table) # Fast wrangling!
library(purrr) # Functional programming
library(fst) # Fast reading & writing of dataframes
library(dtplyr) # Combining dplyr & data.table
library(lubridate) # Date management
library(ggallin) # ggplot add-on for negative log scales
library(viridis) # Cool color palette
library(patchwork) # To layout graphs
library(latex2exp) # LaTeX in graphs
library(tictoc) # CPU timing
library(furrr) # Parallel computing
library(lmtest) # Coefficient tests with HAC standard errors
library(sandwich) # HAC covariance estimators (Newey-West)
data_paths <- read_fst("data/data_paths.fst") # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx") # Loading additional info
data_paths[,1:9] |> head(10) |> knitr::kable() # A snapshot
permno | Date | mvel1 | beta | betasq | chmom | dolvol | idiovol | indmom |
---|---|---|---|---|---|---|---|---|
10000 | 1986-02-28 | 16100.000 | NA | NA | NA | NA | NA | 0.2111586 |
10000 | 1986-03-31 | 11960.000 | NA | NA | NA | NA | NA | 0.2624714 |
10000 | 1986-04-30 | 16330.000 | NA | NA | NA | 7.897668 | NA | 0.3269873 |
10000 | 1986-05-30 | 15172.000 | NA | NA | NA | 8.472954 | NA | 0.3479495 |
10000 | 1986-06-30 | 11793.859 | NA | NA | NA | 8.250098 | NA | 0.3572053 |
10000 | 1986-07-31 | 11734.594 | NA | NA | NA | 8.113567 | NA | 0.4227055 |
10000 | 1986-08-29 | 10786.344 | NA | NA | NA | 8.103863 | NA | 0.3973749 |
10000 | 1986-09-30 | 4148.594 | NA | NA | NA | 8.103882 | NA | 0.3032582 |
10000 | 1986-10-31 | 3911.531 | NA | NA | NA | 8.112181 | NA | 0.3860786 |
10000 | 1986-11-28 | 3002.344 | NA | NA | NA | 8.205756 | NA | 0.2561681 |
norm_unif <- function(v){ # Maps a vector to uniform scores on (-0.5, 0.5] via its empirical cdf
  v <- v %>% as.matrix()
  return(ecdf(v)(v) - 0.5)
}
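For instance, on a toy vector (illustration only), the scores are centred ranks:
norm_unif(c(3, 1, 2)) # 0.500 -0.167 0.167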
The original dataset shows strong variations in the number of well-defined fields before 1983, so we discard observations prior to this date. We also remove a few predictors that are not suited to the sorting study (e.g., binary-valued ones).
data_paths <- data_paths |> filter(Date > "1983-01-01") # Stability filter
data_paths <- data_paths |> arrange(Date, permno) # Order
dates <- data_paths$Date |> unique() |> sort() # Vector of dates
features <- specs |> pull(Acronym) # Predictors / firm characteristics
features <- setdiff(features, c("age", "convind", "divi", "divo", "ms", "nincr", "ps", "rd", "rd_sale", "realestate", "securedind", "sin", "dy"))
data_paths <- data_paths |>
select(all_of(c("permno", "Date", "Return", features))) |>
ungroup()
Empirical work in the paper is modelled as chains of modules: each module below takes the panel as input, applies one layer of methodological choices (sorting variable, imputation, horizon, sample start and stop, threshold and weighting scheme), and passes the result on to the next.
module_variable <- function(data, variable){ # Selects the sorting variable; mvel1 & retvol are always kept for weighting
#save(variable, file = paste0(Sys.time(), "_variable.RData"))
if(!(variable %in% c("mvel1", "retvol"))){
return(data |>
dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol", variable))) |>
purrr::set_names(c("permno", "Date", "Return", "mvel1", "retvol", "variable")))
}
if(variable == "mvel1"){
return(data |>
dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
mutate(variable = mvel1))
}
if(variable == "retvol"){
return(data |>
dplyr::select(all_of(c("permno", "Date", "Return", "mvel1", "retvol"))) |>
mutate(variable = retvol))
}
}
impute <- function(v){ # Replaces missing values with the median of the vector
v[is.na(v)] <- median(v, na.rm = T)
return(v)
}
module_imputation <- function(data, imputation){ # 0: leave missing values; 1: per-date median imputation
if(imputation == 0){return(data)}
if(imputation == 1){
return(
data |>
group_by(Date) |>
mutate(variable = impute(variable)) |>
ungroup()
)
}
}
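A toy check (hypothetical two-date panel): missing values are replaced by the median within each date.
tibble(Date = rep(c(1, 2), each = 3), variable = c(2, NA, 4, 10, NA, 30)) |>
  module_imputation(1) # NAs become 3 on date 1 and 20 on date 2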
module_horizon <- function(data, horizon){ # Average return over the next `horizon` months; filters out near-zero-volatility stocks
vol_thresh <- 0.001 # hard-coded!
if(horizon == 1){return(data |> na.omit() |> filter(retvol > vol_thresh))}
if(horizon == 2){
return(data |>
group_by(permno) |>
mutate(Return = (Return + dplyr::lead(Return)) / horizon) |> # Returns are scaled to the monthly scale
ungroup() |>
na.omit() |>
filter(retvol > vol_thresh)
)
}
if(horizon == 3){
return(data |>
group_by(permno) |>
mutate(Return = (Return + dplyr::lead(Return) + dplyr::lead(Return, 2)) / horizon) |>
ungroup() |>
na.omit() |>
filter(retvol > vol_thresh)
)
}
}
module_start <- function(data, start_quantile){ # Drops the first start_quantile fraction of rows (the panel is sorted by date)
  L <- nrow(data)
  data[ceiling(L*start_quantile):L, ]
}
module_stop <- function(data, stop_quantile){ # Keeps the first stop_quantile fraction of the rows it receives
  L <- nrow(data)
  data[1:ceiling(L*stop_quantile), ]
}
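A toy illustration (hypothetical 9-row frame). Note that the slicers compose: when both are applied, module_stop's quantile is relative to the rows left over by module_start, not to the full sample.
toy <- tibble(row = 1:9)
toy |> module_start(1/3) |> pull(row) # 3 4 5 6 7 8 9
toy |> module_start(1/3) |> module_stop(2/3) |> pull(row) # 3 4 5 6 7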
module_thresh_type <- function(data, thresh_type){ # e.g. "0.10_EW": quantile threshold followed by the weighting scheme
type <- substr(thresh_type,(nchar(thresh_type)-1),nchar(thresh_type)) # Last 2 letters
thresh <- substr(thresh_type, 1, (nchar(thresh_type)-3)) |> as.numeric()
if(type == "EW"){
z <- data |>
na.omit() |>
group_by(Date) |>
mutate(
tm = quantile(variable, thresh, na.rm = T),
tp = quantile(variable, 1-thresh, na.rm = T)
)
z$type <- (z$variable <= z$tm)
z$type <- z$type - (z$variable >= z$tp)
z <- z[z$type != 0, ]
z$weight <- z$type # Equal weights: +1 for the long leg (low values of the characteristic), -1 for the short leg
z <- z |>
pivot_wider(names_from = "type", values_from = "weight") |>
group_by(Date) |>
mutate(long = if_else(is.na(`1`), 0, `1`),
long = long / sum(long),
short = if_else(is.na(`-1`), 0, `-1`),
short = short / sum(abs(short)),
position = long + short) |>
ungroup() |>
mutate(scale = length(unique(Date))/sum(position^2), # This scaling makes the OLS slope below equal the mean monthly long-short return
       position2 = position * scale)
fit <- lm(Return ~ position2 - 1, data = z)
return(data.frame(m = coef(fit),
sd = summary(fit)$coefficients[, 2],
sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
n = nrow(z),
n_dates = length(unique(z$Date)),
aic = AIC(fit)))
}
if(type == "VW"){
z <- data |>
filter(is.finite(mvel1), mvel1 > 0) |> # Value weighting requires positive, finite market caps
na.omit() |>
group_by(Date) |>
mutate(
tm = quantile(variable, thresh, na.rm = T),
tp = quantile(variable, 1-thresh, na.rm = T)
)
z$type <- (z$variable <= z$tm)
z$type <- z$type - (z$variable >= z$tp)
z <- z[z$type != 0, ]
z$weight <- z$type*z$mvel1 # Value weights: sign times market capitalization
z <- z |>
pivot_wider(names_from = "type", values_from = "weight") |>
group_by(Date) |>
mutate(long = if_else(is.na(`1`), 0, `1`),
long = long / sum(long),
short = if_else(is.na(`-1`), 0, `-1`),
short = short / sum(abs(short)),
position = long + short) |>
ungroup() |>
mutate(scale = length(unique(Date))/sum(position^2),
position2 = position * scale)
fit <- lm(Return ~ position2 - 1, data = z)
return(data.frame(m = coef(fit),
sd = summary(fit)$coefficients[, 2],
sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
n = nrow(z),
n_dates = length(unique(z$Date)),
aic = AIC(fit)))
}
if(type == "IW"){
z <- data |>
filter(is.finite(retvol), retvol > 0) |> # Inverse-volatility weighting requires positive, finite volatility
na.omit() |>
group_by(Date) |>
mutate(
tm = quantile(variable, thresh, na.rm = T),
tp = quantile(variable, 1-thresh, na.rm = T)
)
z$type <- (z$variable <= z$tm)
z$type <- z$type - (z$variable >= z$tp)
z <- z[z$type != 0, ]
z$weight <- z$type/z$retvol # Inverse-volatility weights: sign divided by volatility
z <- z |>
pivot_wider(names_from = "type", values_from = "weight") |>
group_by(Date) |>
mutate(long = if_else(is.na(`1`), 0, `1`),
long = long / sum(long),
short = if_else(is.na(`-1`), 0, `-1`),
short = short / sum(abs(short)),
position = long + short) |>
ungroup() |>
mutate(scale = length(unique(Date))/sum(position^2),
position2 = position * scale)
fit <- lm(Return ~ position2 - 1, data = z)
return(data.frame(m = coef(fit),
sd = summary(fit)$coefficients[, 2],
sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
n = nrow(z),
n_dates = length(unique(z$Date)),
aic = AIC(fit)))
}
if(type == "CW"){
z <- data |>
na.omit() |>
group_by(Date) |>
mutate(variable = norm_unif(variable), # Uniformised scores serve directly as signed positions
       sign = sign(variable + rnorm(1)/10^6)) |> # Tiny jitter breaks ties at zero
group_by(Date, sign) |>
mutate(position = variable / sum(abs(variable)),
scale = length(unique(Date))/sum(position^2),
position2 = position * scale)
fit <- lm(Return ~ position2 - 1, data = z)
return(data.frame(m = coef(fit),
sd = summary(fit)$coefficients[, 2],
sd_hac = coeftest(fit, vcov. = NeweyWest(fit))[1,2],
vol = z |> group_by(Date) |> summarise(r = sum(Return*position)) |> pull(r) |> sd(),
n = nrow(z),
n_dates = length(unique(z$Date)),
aic = AIC(fit)))
}
}
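Why the intercept-free regression works: because positions are rescaled by the factor T / sum(position^2), the OLS slope of Return on position2 equals the average per-date long-short return, so a single lm() call delivers the mean return together with its IID and HAC standard errors. A minimal check on synthetic data (illustration only, all names made up):
set.seed(42)
toy <- tibble(Date = rep(1:50, each = 20),
              Return = rnorm(1000, 0.01, 0.05),
              position = rep(c(rep(0.1, 10), rep(-0.1, 10)), 50)) |> # Long/short legs sum to +1/-1 each date
  mutate(scale = length(unique(Date)) / sum(position^2),
         position2 = position * scale)
coef(lm(Return ~ position2 - 1, data = toy)) # Equals the mean of per-date long-short returns:
toy |> group_by(Date) |> summarise(r = sum(Return * position)) |> pull(r) |> mean()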
A test below. There are 7 outputs: the mean return, the standard deviation of the estimated average return (IID and HAC), the volatility (standard deviation of portfolio returns), the sample size, the number of dates, and the AIC of the regression.
# A first test.
data_paths |>
module_variable("absacc") |>
module_imputation(0) |>
module_horizon(1) |>
module_start(1/3) |>
module_stop(2/3) |>
module_thresh_type("0.10_EW")
## m sd sd_hac vol n n_dates
## position2 0.0004014327 0.0009473093 0.001917562 0.05446198 191078 187
## aic
## position2 -87706.41
The aggregator, which chains all the modules (and a test).
module_aggregator <- function(variable = variable,
imputation = imputation,
horizon = horizon,
start_quantile = start_quantile,
stop_quantile = stop_quantile,
thresh_type = thresh_type){
data_paths |>
module_variable(variable = variable) |>
module_imputation(imputation = imputation) |>
module_horizon(horizon = horizon) |>
module_start(start_quantile = start_quantile) |>
module_stop(stop_quantile = stop_quantile) |>
module_thresh_type(thresh_type = thresh_type)
}
# Test!
module_aggregator(variable = "absacc",
imputation = 0,
horizon = 1,
start_quantile = 0,
stop_quantile = 1/3,
thresh_type = "0.10_EW")
## m sd sd_hac vol n n_dates
## position2 0.006979506 0.00090491 0.001133974 0.02614822 143350 168
## aic
## position2 -102327.5
Surprisingly, we discovered that slicing the parallelization (one parallel call per anomaly) improves its speed, hence the slightly heavy loop below: running a single parallel map over all 44K parameter combinations was too slow (and penalizing upon errors).
Below, we specify the layer options and launch the spanning of all paths. For each anomaly, one output file is created with the "*_results_fork.RData*" suffix.
The results for the CW weighting scheme (characteristics-weighted) are
run separately with the following parameters:
thresh <- 0.5
type <- "CW"
imputation <- c(0,1)
horizon <- 1:3
start_quantile <- c(0, 1/3, 2/3)
stop_quantile <- c(1/3, 2/3, 1)
thresh <- c(0.1, 0.15, 0.2, 0.25, 0.3)
type <- c("EW", "VW", "IW", "CW")
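As a quick sanity check (not in the original script), the per-anomaly grid size after removing redundant combinations can be computed upfront; with the settings above it comes to 576 paths per characteristic:
expand_grid(imputation, horizon, start_quantile, stop_quantile, thresh, type) |>
  filter(stop_quantile > start_quantile,
         !(type == "CW" & thresh != 0.3)) |>
  nrow() # 576 paths per characteristic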
for(i in 1:length(features)){ # PB 63
if(i%%5==0){print(i)}
variable <- features[i]
pars <- expand_grid(variable, imputation, horizon, start_quantile, stop_quantile, thresh, type) |>
  filter(stop_quantile > start_quantile, # Remove impossible sample windows
         !(type == "CW" & thresh %in% c(0.1, 0.15, 0.2, 0.25))) # CW ignores the threshold; keep a single combination
pars2 <- pars |> mutate(thresh_type = paste0(thresh, "_", type)) |>
select(-thresh, -type)
plan(multisession, workers = 5) # Set the number of cores
if(i%%10==0){tictoc::tic()} # Launch timer
output <- future_pmap_dfr(pars2, module_aggregator) # Parallel routine
if(i%%10==0){tictoc::toc()} # Stop timer
results <- cbind(pars, output) # Bind parameters to results
colnames(results)[8:ncol(results)] <- c("avg_return", "sd", "sd_hac", "vol", "n", "n_dates", "aic")
save(results, file = paste0(variable, "_results_fork.RData"))
}
Then, all the output files must be moved to a separate folder within the main folder. Below, we call it "output_ols".
results_full <- c()
filenames <- list.files("output_ols", pattern="*.RData", full.names = TRUE)
for(j in 1:length(filenames)){
load(filenames[j])
results_full <- bind_rows(results_full, results)
}
results_full <- results_full |> data.frame() |> remove_rownames()
#save(results_full, file = "results_full.RData")
The two numbers below are the average hacking intervals, i.e., the spread between the largest and smallest t-statistic (with t = sqrt(n_dates) * avg_return / vol) across paths, computed over simple robustness checks versus over all paths.
library(tidyverse)
load("results_full.RData")
res_baseline <- results_full |>
mutate(t = avg_return/vol*sqrt(n_dates)) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t),
m = median(t)) |>
filter(imputation == 1,
horizon == 1,
start_quantile == 0,
stop_quantile == 1,
thresh == 0.2,
type == "EW")
res_robust <- results_full |>
mutate(t = avg_return/vol*sqrt(n_dates)) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t),
m = median(t)) |>
# Baseline: imputation == 1, horizon == 1, start_quantile == 0, stop_quantile == 1, thresh == 0.2, type == "EW".
# Each clause below frees exactly one of these layers:
filter( (horizon == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
(imputation == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
(imputation == 1 & horizon == 1 & stop_quantile == 1 & thresh == 0.2 & type == "EW") |
(imputation == 1 & horizon == 1 & start_quantile == 0 & thresh == 0.2 & type == "EW") |
(imputation == 1 & horizon == 1 & start_quantile == 0 & stop_quantile == 1 & type == "EW") |
(imputation == 1 & horizon == 1 & start_quantile == 0 & stop_quantile == 1 & thresh == 0.2)
) |>
group_by(variable) |>
mutate(min = min(t),
max = max(t),
hackint = max - min) |>
ungroup()
res_robust |> summarise(mean(hackint))
## # A tibble: 1 × 1
## `mean(hackint)`
## <dbl>
## 1 4.08
results_full |>
mutate(t = avg_return/vol*sqrt(n_dates)) |>
group_by(variable) |>
mutate(min = min(t),
max = max(t),
hackint = max - min) |>
ungroup() |>
summarise(mean(hackint))
## # A tibble: 1 × 1
## `mean(hackint)`
## <dbl>
## 1 11.0
Then, we show the boxplots of t-statistics for each firm characteristic.
results_full |>
mutate(t = avg_return/sd_hac) |> # t-statistics based on HAC standard errors (this definition supersedes the vol-based one)
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t),
m = median(t)) |>
ggplot(aes(y = reorder(variable, m), x = t)) +
geom_boxplot() + theme_bw() +
geom_vline(xintercept = 0, linetype = 2, color = "red") +
geom_vline(xintercept = -2.5, linetype = 3, color = "orange") +
geom_vline(xintercept = 2.5, linetype = 3, color = "orange") +
theme(axis.title = element_blank()) +
geom_point(data = res_baseline, color = "red") +
geom_point(data = res_robust, aes(x = min), color = "blue", shape = 18, size = 2) +
geom_point(data = res_robust, aes(x = max), color = "blue", shape = 18, size = 2)
#ggsave("t_stats_anom.pdf", width = 8.4, height = 10)
Same type of representation, but broken down by weighting scheme.
results_full |>
mutate(t = avg_return/vol*sqrt(n_dates)) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t),
m = median(t)) |>
group_by(variable, type) |>
summarise(t = mean(t)) |>
mutate(diff = max(t) - min(t)) |>
ggplot(aes(x = t, y = reorder(variable, diff), color = type)) +
geom_line(color = "black") + theme_bw() +
geom_point() +
theme(axis.title.y = element_blank(),
legend.position = c(0.75, 0.2)) +
geom_vline(xintercept = 2.5, linetype = 2) +
scale_color_manual(name = "weighting", values = c("#F74646", "#FFA22B", "#5E51E0", "#000000"),
labels = c("Characteristics-based", "Uniform", "Inverse volatility", "Value (market cap)")) +
xlab("average t-statistic")
#ggsave("t_weighting.pdf", width = 8.4, height = 10)
Let's see how that translates in terms of the cdf of t-statistics.
results_full |>
mutate(t = avg_return/vol*sqrt(n_dates)) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t) ) |>
ungroup() |>
group_by(type) |>
mutate(v = ecdf(t)(t)) |>
ggplot(aes(x = t, y = v, color = type)) + #geom_histogram(position = "dodge") +
geom_line() + theme_bw() +
scale_color_manual(values = c("#FE8C1A", "#4F99D0", "#CC2222", "#000000"), name = "weighting") +
theme(legend.position = c(0.25,0.7),
legend.title = element_text(face = "bold"),
axis.title.y = element_blank()) +
xlab("t-statistic") +
geom_vline(xintercept = 7.6, linetype = 2) +
annotate("text", x = 9, y = 0.7, label = TeX("max t-stat \n for VW \n portfolios"))
ggsave("t_cdf.pdf", width = 8, height = 4)
The next step is to extract the histogram values on the [0, 1/2] interval. Because we work with bins of width 0.1, this gives only 5 values.
n_i <- function(v, bins){ # Extracts histogram counts over the given bins
temp <- hist(v, breaks = bins, plot = F)
temp$counts
}
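A quick check on made-up p-values (illustration only):
n_i(c(0.05, 0.12, 0.13, 0.95), (0:10)/10) # 1 2 0 0 0 0 0 0 0 1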
n <- 10
bins <- (0:n)/n
table_counts <- results_full |>
mutate(t = sqrt(n_dates)*avg_return/vol) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t) ) |>
ungroup() |>
mutate(t = 1-pnorm(t)) |>
group_by(variable) |>
summarise(counts = n_i(t, bins)) |>
mutate(quantile = row_number()) |>
pivot_wider(names_from = variable, values_from = counts) |>
data.matrix()
crit_1 <- 4 # Number of bin ratios used by the criterion
table_counts <- table_counts + 1 # To avoid issues when counts are zero
crit <- (lead(table_counts) / table_counts) / # Ratio of successive bin counts...
  matrix(rep(1:nrow(table_counts), ncol(table_counts)),
         nrow = nrow(table_counts),
         ncol = ncol(table_counts),
         byrow = F) # ...deflated by the bin index
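In formula form (reconstructed from the code above), with $n_q$ the raw count of p-values in the $q$-th bin of width 0.1, the criterion averages the deflated ratios of successive bin counts:
$$\text{crit} = \frac{1}{4}\sum_{q=1}^{4}\frac{1}{q}\,\frac{n_{q+1}+1}{n_q+1}.$$
Roughly, when most paths already yield small p-values, the ratios are small and p-hacking is "unnecessary"; when the mass sits away from zero, only cherry-picked paths reach significance and hacking is flagged as "problematic".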
criteria <- apply(crit[1:crit_1, -1], 2, mean, na.rm = T) |>
t() |>
as_tibble(rownames = NA) |>
pivot_longer(cols = everything(), names_to = "variable", values_to = "crit") |>
mutate(criterion = if_else(crit < 0.25 , "unnecessary",
if_else(crit > 0.4, "problematic", "possible")),
criterion = recode_factor(criterion,
`unnecessary` = "unnecessary",
`possible` = "possible",
`problematic` = "problematic",
.ordered = T
))
criteria |> group_by(criterion) |> count()
## # A tibble: 3 × 2
## # Groups: criterion [3]
## criterion n
## <ord> <int>
## 1 unnecessary 38
## 2 possible 28
## 3 problematic 15
Let's finally have a look at the distribution of p-values, one sorting variable at a time.
results_full |>
mutate(t = sqrt(n_dates)*avg_return/vol) |>
group_by(variable) |>
mutate(m = median(t),
t = if_else(m>0, t, -t) ) |>
ungroup() |>
mutate(t = 1-pnorm(t)) |>
left_join(criteria, by = "variable") |>
ggplot(aes(x = t, fill = criterion)) + geom_vline(xintercept = 0.5, linetype = 2) +
geom_histogram(breaks = bins) +
facet_wrap(vars(variable), ncol = 8, scales = "free_y") +
theme_bw() +
scale_x_continuous(breaks = c(0, 0.5, 1)) +
scale_fill_manual(values = c("#115782", "#FF9D36", "#E7002E"), name = "vicious p-hacking is:") +
theme(axis.title = element_blank(),
legend.position = c(0.5, 0.03),
legend.title = element_text(face = "bold"),
strip.text = element_text(size = 6),
strip.background = element_rect(fill = "#E1EBF0")) +
guides(fill = guide_legend(nrow = 1, byrow = TRUE))
ggsave("pval_dist.pdf", width = 8, height = 10)
library(ggrepel)
library(patchwork)
res_per1 <- results_full |>
mutate(period = case_when(
start_quantile == 0 & stop_quantile == 1/3 ~ "Period 1",
start_quantile == 1/3 & stop_quantile == 2/3 ~ "Period 2",
start_quantile == 2/3 & stop_quantile == 1 ~ "Period 3",
.default = "Other"
)) |>
group_by(variable, period) |>
summarise(avg_return = mean(avg_return)) |>
ungroup() #|> filter(avg_return > -0.1)
c1 <- (res_per1 |>
filter(period %in% c("Period 1", "Period 2")) |>
pivot_wider(names_from = period, values_from = avg_return) |>
select(-1) |>
data.matrix() |>
cor(use = "complete.obs"))[1,2]
g1 <- res_per1 |>
pivot_wider(names_from = "period", values_from = "avg_return") %>% {
ggplot(., aes(x = `Period 1`, y = `Period 2`)) + geom_smooth(method = "lm", se = F, color = "#B0D1FA") +
geom_hline(yintercept = 0, color = "#CDCDCD", linetype = 2) +
geom_vline(xintercept = 0, color = "#CDCDCD", linetype = 2) + geom_point() +
theme_bw() + theme(axis.title = element_text(face = "bold"),
panel.grid = element_blank()) +
geom_point(data = filter(., abs(`Period 1`) > 0.01), color = "red") +
geom_text_repel(data = filter(., abs(`Period 1`) > 0.01), aes(label = variable)) +
annotate("text", label = paste("Correlation =", round(c1,3)), x = 0.01, y = -0.005, color = "#568FCB", fontface = 2)
}
res_per2 <- res_per1 # Same per-period average returns as res_per1, reused for the second panel
c2 <- (res_per2 |>
filter(period %in% c("Period 2", "Period 3")) |>
pivot_wider(names_from = period, values_from = avg_return) |>
select(-1) |>
data.matrix() |>
cor(use = "complete.obs"))[1,2]
g2 <- res_per2 |>
pivot_wider(names_from = "period", values_from = "avg_return") %>% {
ggplot(., aes(x = `Period 2`, y = `Period 3`)) + geom_smooth(method = "lm", se = F, color = "#B0D1FA") +
geom_hline(yintercept = 0, color = "#CDCDCD", linetype = 2) +
geom_vline(xintercept = 0, color = "#CDCDCD", linetype = 2) + geom_point() +
theme_bw() + theme(axis.title = element_text(face = "bold"),
panel.grid = element_blank()) +
geom_point(data = filter(., `Period 2` > 0.004 | `Period 2` < (-0.005)), color = "red") +
geom_text_repel(data = filter(., `Period 2` > 0.004 | `Period 2` < (-0.005)), aes(label = variable)) +
annotate("text", label = paste("Correlation =", round(c2,3)), x = 0.003, y = -0.005, color = "#568FCB", fontface = 2)
}
g1 / g2
ggsave("oos.pdf", width = 8, height = 6)
A focus on the acc (accruals) characteristic.
thresh <- 0.2
data_paths |>
select(permno, Date, Return, acc, mvel1, retvol) |>
mutate(variable = acc) |>
na.omit() |>
group_by(Date) |>
mutate(type = case_when(
variable <= quantile(variable, thresh, na.rm = T) ~ "Short",
variable >= quantile(variable, 1 - thresh, na.rm = T) ~ "Long")
) |>
# mutate(type = if_else(variable < quantile(variable, thresh, na.rm = T), "Short",
# if_else(variable > quantile(variable, 1 - thresh, na.rm = T), "Long", "Neutral"))) |>
pivot_wider(names_from = "type", values_from = "Return") |>
mutate(is_Long = is.finite(Long),
is_Short = is.finite(Short)) |>
group_by(Date) |>
summarize(ret = sum(Long/retvol, na.rm = T)/sum(is_Long/retvol, na.rm = T) -
sum(Short/retvol, na.rm = T)/sum(is_Short/retvol, na.rm = T)) |>
summarise(m = mean(ret, na.rm = T),
s = sd(ret, na.rm = T),
n = length(ret))
## # A tibble: 1 × 3
## m s n
## <dbl> <dbl> <int>
## 1 -0.00254 0.0227 467
thresh <- 0.2
z <- data_paths |>
select(permno, Date, Return, absacc, mvel1, retvol) |>
filter(is.finite(retvol), retvol > 0) |> # CHECK !!!!
mutate(variable = absacc) |>
na.omit() |>
group_by(Date) |>
mutate(
tm = quantile(variable, thresh, na.rm = T),
tp = quantile(variable, 1-thresh, na.rm = T)
) |>
ungroup()
z$type <- (z$variable >= z$tp) # +1 for the long leg (high values of the characteristic)
z$type <- z$type - (z$variable <= z$tm) # -1 for the short leg
z <- z[z$type != 0, ]
z$weight <- z$type #/z$retvol # Equal weights here; append /z$retvol for the inverse-volatility variant
z <- z |>
pivot_wider(names_from = "type", values_from = "weight") |>
group_by(Date) |>
mutate(long = if_else(is.na(`1`), 0, `1`),
long = long / sum(long),
short = if_else(is.na(`-1`), 0, `-1`),
short = short / sum(abs(short)),
position = long + short) |>
ungroup() |>
mutate(scale = length(unique(Date))/sum(position^2),
position2 = position * scale)
fit <- lm(Return ~ position2 - 1, data = z)
# z <- data_paths |>
# select(all_of(c("permno", "Date", "Return", variable))) |>
# na.omit() |>
# group_by(Date) |>
# mutate(variable = norm_unif(!!sym(variable)),
# sign = sign(variable + rnorm(1)/10^6)) |>
# group_by(Date, sign) |>
# mutate(position = variable / sum(abs(variable)),
# scale = length(unique(Date))/sum(position^2),
# position2 = position * scale)
In this section, we use the data from the Open Source Asset Pricing paper, which allows us to compare our results to those of the original studies.
The colors encode the ease-to-replicate indicator defined in the paper: it measures whether the values reported in the published articles can easily be attained by some of the paths we generate.
published_anomalies <- read_csv("https://raw.githubusercontent.com/OpenSourceAP/CrossSection/master/SignalDoc.csv")
specs2 <- specs[,c(2,4,5)]
colnames(specs2) <- c("variable", "author", "journal")
specs2 <- specs2 |>
separate(journal, sep = ", ", into = c("journal", "year")) |>
separate(author, sep = " ", into = c("first_auth", "other_auth")) |>
separate(journal, sep = ",", into = c("year", "journal")) |>
mutate(first_auth = first_auth |> str_remove(",")) |>
mutate(id = paste(first_auth, year)) |>
left_join(
published_anomalies |>
rename(t_stat = `T-Stat`) |>
select(Authors, Year, Return, t_stat) |>
separate(Authors, sep = " ", into = c("first_auth", "other_auth")) |>
mutate(first_auth = first_auth |> str_remove(",")) |>
mutate(id = paste(first_auth, Year)),
by = "id") |>
filter(is.finite(Return), is.finite(t_stat)) |>
mutate(Return = Return / 100) |>
select(variable, Return, t_stat, Year, id)
grouped_results <- results_full |>
  group_by(variable) |>
  filter(is.finite(aic), is.finite(avg_return)) |>
  mutate(D = (aic - min(aic)), # AIC gap to the best path
         weight = if_else(is.na(D), 0, exp(-D/2) /sum(exp(-D/2), na.rm = T)), # Akaike weights
         b = sum(weight * avg_return, na.rm = T)) |> # Model-averaged return
  summarise(s = sum(weight * sqrt(sd_hac^2+(b-avg_return)^2), na.rm = T), # Model-averaged standard error
            b = abs(mean(b)),
            n_paths = n()) |> # Number of paths per anomaly
  left_join(specs2, by = "variable") |>
  na.omit()
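For reference, my reading of the weighting above is standard Akaike model averaging (à la Burnham & Anderson): with $\Delta_p$ the AIC gap of path $p$ to the best path for a given anomaly,
$$w_p = \frac{e^{-\Delta_p/2}}{\sum_{p'} e^{-\Delta_{p'}/2}}, \qquad \tilde b = \sum_p w_p \,\bar r_p, \qquad s = \sum_p w_p \sqrt{\widehat{se}_p^{\,2} + (\tilde b - \bar r_p)^2},$$
where $\bar r_p$ is the average return of path $p$, $\widehat{se}_p$ its HAC standard error, and the reported coefficient b is $|\tilde b|$.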
quant <- 0.9
res_plot <- results_full |>
left_join(grouped_results, by = "variable") |>
filter(is.finite(Return)) |>
group_by(variable) |>
mutate(m = mean(avg_return),
sd = sd(avg_return),
avg_return = if_else(m > 0, avg_return, -avg_return),
m = mean(avg_return),
q = pnorm(Return, mean = m, sd = sd), # Quantile of the published return within the Gaussian fit of path returns
EtR = 1-if_else(q>quant, (q-quant)/(1-quant), 0), # Ease-to-replicate indicator
min_q = min(q)) |>
filter(q == min_q)
res_plot |>
ggplot(aes(x = avg_return, y = reorder(variable, b))) + geom_vline(xintercept = 0, linetype = 2) +
geom_point(color = "#AAAAAA", size = 0.7) + theme_bw() +
#geom_errorbar(aes(xmin = m - 2.58*sd/sqrt(n), xmax = m + 2.58*sd/sqrt(n)), color = "#2266DD") +
geom_errorbar(aes(xmin = b - 2.58*s/sqrt(n_paths), xmax = b + 2.58*s/sqrt(n_paths)), size = 0.3) +
geom_point(aes( x = Return, y = variable), color = "red", shape = 17) +
xlab("Coefficient (average return)") +
geom_label(data = res_plot |> group_by(variable) |> summarise(q = mean(EtR)), size = 3.5,
label.padding = unit(0.12, "lines"),
aes(y = variable, x = 0.028, label = paste0(round(100*q, 1), "%"), fill = q), alpha = 0.4) +
theme(axis.title.y = element_blank(),
legend.position = "none",
axis.title.x = element_text(face = "bold")) +
scale_fill_distiller(palette = "RdYlGn", direction = 1)
ggsave("comparison.pdf", width = 8, height = 5)
results_full |>
ggplot(aes(x = avg_return/vol)) + geom_histogram() +
facet_wrap(vars(variable), scales = "free") + theme_bw()
We plot the distribution of the average return across the paths, along with Gaussian and Student fits.
Lastly, we locate the value of the original study, far to the right of the histogram.
results_acc <- results_full |>
filter(variable == "acc") |>
mutate(D = (aic - min(aic)),
weight = if_else(is.na(D), 0, exp(-D/2) /sum(exp(-D/2), na.rm = T)),
b = sum(weight * avg_return, na.rm = T),
s = sum(weight * sqrt(sd_hac^2+(b-avg_return)^2), na.rm = T))
b <- results_acc$b[1]
s <- results_acc$s[1]
stu <- function(x){ # Location-scale Student density with 3 degrees of freedom, fitted via sample moments
  dt((x-mean(results_acc$avg_return))/sd(results_acc$avg_return), df = 3)/sd(results_acc$avg_return)
}
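In formula form (as implemented above), with $\hat\mu$ and $\hat\sigma$ the sample mean and standard deviation of the path returns, the fitted density is the location-scale transform
$$f(x) = \frac{1}{\hat\sigma}\, t_3\!\left(\frac{x-\hat\mu}{\hat\sigma}\right),$$
where $t_3$ denotes the standard Student density with 3 degrees of freedom.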
#pt((0.008666667-mean(results_acc$avg_return))/sd(results_acc$avg_return), df = 3) # Student quantile
#qnorm(0.9, mean(results_acc$avg_return), sd(results_acc$avg_return))
#pnorm(0.008666667, mean(results_acc$avg_return), sd(results_acc$avg_return))
results_acc |>
ggplot(aes(x = avg_return)) + geom_vline(xintercept = 0, color = "#CCCCCC", linetype = 2) +
geom_histogram(aes(y = ..density..), alpha = 0.4) + theme_bw() +
xlab("Average return") + xlim(-0.0027, 0.009) +
theme(axis.title.x = element_text(face = "bold"),
axis.title.y = element_blank(),
panel.grid = element_blank()) +
geom_function(fun = dnorm,
args = list(mean = mean(results_acc$avg_return),
sd = sd(results_acc$avg_return)),
color = "#2288DD"
) +
geom_vline(xintercept = qnorm(0.9, mean(results_acc$avg_return), sd(results_acc$avg_return)), color = "#999999") +
geom_function(fun = stu) +
geom_segment(x = 0.008666667, xend = 0.008666667, y = 60, yend = 5,
arrow = arrow(length = unit(0.2, "cm")), color = "red") +
geom_label(aes(x = 0.0058, y = 230), label = "Gaussian \n 90% \n threshold", size = 3.5) +
#geom_segment(x = b-2.58*s/sqrt(576), xend = b+2.58*s/sqrt(576), y = 250, yend = 250) +
annotate("text", x = 0.0086, y = 80, label = "original \n study", color = "red", fontface = 2) +
annotate("text", x = 0.001, y = 162, label = "fitted \n Gaussian", color = "#2288DD", fontface = 2) +
annotate("text", x = -0.001, y = 60, label = "fitted \n Student", color = "#000000", fontface = 2)
# ggsave("acc.pdf", width = 8, height = 4)