library(xgboost)
library(tidyverse)
load("data_mat_rk.RData")
tick <- levels(data_mat_rk$CompanyRelatedID)        # IDs of stocks
dates <- levels(as.factor(data_mat_rk$DataDate))    # List of dates
# N <- length(tick)                   
oos <- "2007-12-06"
dates_oos <- dates[dates>oos]                       # Out-of-sample dates
Tt <- length(dates_oos)                             # Number of dates
nb_port <- 6                                        # Number of portfolios
q_port <- 1 / nb_port                     
q_port <- seq(0, 1, by  = q_port)                   # Quantiles for portfolio construction
r_port <- matrix(0, ncol = nb_port, nrow = Tt)      # Portfolio returns
offset_date <- 0                                    # 0 => is equivalent to 5 years
max_depth <- 5                                      # Maximum nb of levels for the trees
q <- 0.2                                            # Filtering intensity: be careful, inverses for bulk filter
ptm <- proc.time()                                  # Initiate CPU time
for(t in 1:Tt){
    if(t%%12==0){print(dates_oos[t])}
    x <- filter(data_mat_rk, 
                DataDate < as.Date(dates_oos[t],origin="1970-01-01")-365,  # No forward-looking bias!
                DataDate > dates[t + offset_date])                         # Rolling window
    # The line below determines the filtering intensity. Omit it to remove the filter.
    x <- filter(x, R12M_F >= quantile(x$R12M_F,1-q) | R12M_F<=quantile(x$R12M_F,q))      # Filtered data: extreme
    # x <- filter(x, R12M_F >= quantile(x$R12M_F,q) & R12M_F <= quantile(x$R12M_F,1-q))  # Filtered data: bulk
    train_data <- dplyr::select(x,-CompanyRelatedID,-DataDate,-R1M_F,-R12M_F) %>% data.matrix()
    train_label <- dplyr::select(x, R12M_F) %>% data.matrix()                            # Training label
    train_matrix <- xgb.DMatrix(data = train_data, label = train_label)                  # Full training data
    fit <- xgb.train(eta = 0.7,                    # Learning rate, 0.7 for light, 0.5 for intermed. 0.3 for deep
                     max_depth = max_depth,        # Maximum depth
                     gamma = 0,                    # Penalization
                     lambda = 1,                   # Penalization
                     objective = "reg:linear",     # Objective function
                     data = train_matrix,          # Data source
                     nrounds = 25)                 # Nb of trees, 25 for light, 50 for intermed., 100 for deep
    y <- filter(data_mat_rk, DataDate == dates_oos[t]) # Current values
    test_data <- dplyr::select(y,-CompanyRelatedID,-DataDate,-R1M_F,-R12M_F) %>% data.matrix()
    test_label <- dplyr::select(y, R1M_F) %>% data.matrix()          
    test_matrix <- xgb.DMatrix(data = test_data, label = test_label) # Full testing data
    pred <- predict(fit, test_matrix)                                # Prediction
    z <- filter(data_mat_rk, DataDate == dates_oos[t])               # Current data
    for(j in 2:length(q_port)){ # Loop creating the portfolios (could use quantcut)
        ind <- which(pred <= quantile(pred, q_port[j]) & pred >= quantile(pred,q_port[j-1]))
        r_port[t,j-1] <- mean(z$R1M_F[ind]) # Aggregating stock returns into portfolios
    }
}
[1] "2008-11-30"
[1] "2009-11-30"
[1] "2010-11-30"
[1] "2011-11-30"
[1] "2012-11-30"
[1] "2013-11-30"
[1] "2014-11-30"
[1] "2015-11-30"
proc.time() - ptm                           # CPU time difference
   user  system elapsed 
359.670  18.167 381.805 
colMeans(r_port)                            # Average of portfolio returns
[1] 0.007391463 0.007728983 0.007502411 0.007108819 0.010058449 0.014720245
t.test(r_port[,6]-r_port[,1])               # t-test on difference of extreme portfolios

    One Sample t-test

data:  r_port[, 6] - r_port[, 1]
t = 2.2087, df = 101, p-value = 0.02945
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 0.0007465244 0.0139110387
sample estimates:
  mean of x 
0.007328782 
ew <- data_mat_rk %>%                       # Benchmark: EW portfolio
    group_by(DataDate) %>%
    summarise(Ret = mean(R1M_F))
mean(ew$Ret)                                # Average return of benchmark
[1] 0.01126462
LS0tCnRpdGxlOiAiUG9ydGZvbGlvIGJhY2t0ZXN0cyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3IsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpsaWJyYXJ5KHhnYm9vc3QpCmxpYnJhcnkodGlkeXZlcnNlKQpsb2FkKCJkYXRhX21hdF9yay5SRGF0YSIpCgp0aWNrIDwtIGxldmVscyhkYXRhX21hdF9yayRDb21wYW55UmVsYXRlZElEKSAgICAgICAgIyBJRHMgb2Ygc3RvY2tzCmRhdGVzIDwtIGxldmVscyhhcy5mYWN0b3IoZGF0YV9tYXRfcmskRGF0YURhdGUpKSAgICAjIExpc3Qgb2YgZGF0ZXMKIyBOIDwtIGxlbmd0aCh0aWNrKSAgICAgICAgICAgICAgICAgICAKCm9vcyA8LSAiMjAwNy0xMi0wNiIKZGF0ZXNfb29zIDwtIGRhdGVzW2RhdGVzPm9vc10gICAgICAgICAgICAgICAgICAgICAgICMgT3V0LW9mLXNhbXBsZSBkYXRlcwpUdCA8LSBsZW5ndGgoZGF0ZXNfb29zKSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBOdW1iZXIgb2YgZGF0ZXMKbmJfcG9ydCA8LSA2ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgTnVtYmVyIG9mIHBvcnRmb2xpb3MKcV9wb3J0IDwtIDEgLyBuYl9wb3J0ICAgICAgICAgICAgICAgICAgICAgCnFfcG9ydCA8LSBzZXEoMCwgMSwgYnkgID0gcV9wb3J0KSAgICAgICAgICAgICAgICAgICAjIFF1YW50aWxlcyBmb3IgcG9ydGZvbGlvIGNvbnN0cnVjdGlvbgpyX3BvcnQgPC0gbWF0cml4KDAsIG5jb2wgPSBuYl9wb3J0LCBucm93ID0gVHQpICAgICAgIyBQb3J0Zm9saW8gcmV0dXJucwpvZmZzZXRfZGF0ZSA8LSAwICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyAwID0+IGlzIGVxdWl2YWxlbnQgdG8gNSB5ZWFycwptYXhfZGVwdGggPC0gNSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBNYXhpbXVtIG5iIG9mIGxldmVscyBmb3IgdGhlIHRyZWVzCnEgPC0gMC4yICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIEZpbHRlcmluZyBpbnRlbnNpdHk6IGJlIGNhcmVmdWwsIGludmVyc2VzIGZvciBidWxrIGZpbHRlcgoKcHRtIDwtIHByb2MudGltZSgpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgSW5pdGlhdGUgQ1BVIHRpbWUKZm9yKHQgaW4gMTpUdCl7CiAgICBpZih0JSUxMj09MCl7cHJpbnQoZGF0ZXNfb29zW3RdKX0KICAgIHggPC0gZmlsdGVyKGRhdGFfbWF0X3JrLCAKICAgICAgICAgICAgICAgIERhdGFEYXRlIDwgYXMuRGF0ZShkYXRlc19vb3NbdF0sb3JpZ2luPSIxOTcwLTAxLTAxIiktMzY1LCAgIyBObyBmb3J3YXJkLWxvb2tpbmcgYmlhcyEKICAgICAgICAgICAgICAgIERhdGFEYXRlID4gZGF0ZXNbdCArIG9mZnNldF9kYXRlXSkgICAgICAgICAgICAgICAgICAgICAgICAgIyBSb2xsaW5nIHdpbmRvdwogICAgIyBUaGUgbGluZSBiZWxvdyBkZXRlcm1pbmVzIHRoZSBmaWx0ZXJpbmcgaW50ZW5zaXR5LiBPbWl0IGl0IHRvIHJlbW92ZSB0aGUgZmlsdGVyLgogICAgeCA8LSBmaWx0ZXIoeCwgUjEyTV9GID49IHF1YW50aWxlKHgkUjEyTV9GLDEtcSkgfCBSMTJNX0Y8PXF1YW50aWxlKHgkUjEyTV9GLHEpKSAgICAgICMgRmlsdGVyZWQgZGF0YTogZXh0cmVtZQogICAgIyB4IDwtIGZpbHRlcih4LCBSMTJNX0YgPj0gcXVhbnRpbGUoeCRSMTJNX0YscSkgJiBSMTJNX0YgPD0gcXVhbnRpbGUoeCRSMTJNX0YsMS1xKSkgICMgRmlsdGVyZWQgZGF0YTogYnVsawogICAgdHJhaW5fZGF0YSA8LSBkcGx5cjo6c2VsZWN0KHgsLUNvbXBhbnlSZWxhdGVkSUQsLURhdGFEYXRlLC1SMU1fRiwtUjEyTV9GKSAlPiUgZGF0YS5tYXRyaXgoKQogICAgdHJhaW5fbGFiZWwgPC0gZHBseXI6OnNlbGVjdCh4LCBSMTJNX0YpICU+JSBkYXRhLm1hdHJpeCgpICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgVHJhaW5pbmcgbGFiZWwKICAgIHRyYWluX21hdHJpeCA8LSB4Z2IuRE1hdHJpeChkYXRhID0gdHJhaW5fZGF0YSwgbGFiZWwgPSB0cmFpbl9sYWJlbCkgICAgICAgICAgICAgICAgICAjIEZ1bGwgdHJhaW5pbmcgZGF0YQogICAgZml0IDwtIHhnYi50cmFpbihldGEgPSAwLjcsICAgICAgICAgICAgICAgICAgICAjIExlYXJuaW5nIHJhdGUsIDAuNyBmb3IgbGlnaHQsIDAuNSBmb3IgaW50ZXJtZWQuIDAuMyBmb3IgZGVlcAogICAgICAgICAgICAgICAgICAgICBtYXhfZGVwdGggPSBtYXhfZGVwdGgsICAgICAgICAjIE1heGltdW0gZGVwdGgKICAgICAgICAgICAgICAgICAgICAgZ2FtbWEgPSAwLCAgICAgICAgICAgICAgICAgICAgIyBQZW5hbGl6YXRpb24KICAgICAgICAgICAgICAgICAgICAgbGFtYmRhID0gMSwgICAgICAgICAgICAgICAgICAgIyBQZW5hbGl6YXRpb24KICAgICAgICAgICAgICAgICAgICAgb2JqZWN0aXZlID0gInJlZzpsaW5lYXIiLCAgICAgIyBPYmplY3RpdmUgZnVuY3Rpb24KICAgICAgICAgICAgICAgICAgICAgZGF0YSA9IHRyYWluX21hdHJpeCwgICAgICAgICAgIyBEYXRhIHNvdXJjZQogICAgICAgICAgICAgICAgICAgICBucm91bmRzID0gMjUpICAgICAgICAgICAgICAgICAjIE5iIG9mIHRyZWVzLCAyNSBmb3IgbGlnaHQsIDUwIGZvciBpbnRlcm1lZC4sIDEwMCBmb3IgZGVlcAogICAgeSA8LSBmaWx0ZXIoZGF0YV9tYXRfcmssIERhdGFEYXRlID09IGRhdGVzX29vc1t0XSkgIyBDdXJyZW50IHZhbHVlcwogICAgdGVzdF9kYXRhIDwtIGRwbHlyOjpzZWxlY3QoeSwtQ29tcGFueVJlbGF0ZWRJRCwtRGF0YURhdGUsLVIxTV9GLC1SMTJNX0YpICU+JSBkYXRhLm1hdHJpeCgpCiAgICB0ZXN0X2xhYmVsIDwtIGRwbHlyOjpzZWxlY3QoeSwgUjFNX0YpICU+JSBkYXRhLm1hdHJpeCgpICAgICAgICAgIAogICAgdGVzdF9tYXRyaXggPC0geGdiLkRNYXRyaXgoZGF0YSA9IHRlc3RfZGF0YSwgbGFiZWwgPSB0ZXN0X2xhYmVsKSAjIEZ1bGwgdGVzdGluZyBkYXRhCiAgICBwcmVkIDwtIHByZWRpY3QoZml0LCB0ZXN0X21hdHJpeCkgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgUHJlZGljdGlvbgogICAgeiA8LSBmaWx0ZXIoZGF0YV9tYXRfcmssIERhdGFEYXRlID09IGRhdGVzX29vc1t0XSkgICAgICAgICAgICAgICAjIEN1cnJlbnQgZGF0YQogICAgZm9yKGogaW4gMjpsZW5ndGgocV9wb3J0KSl7ICMgTG9vcCBjcmVhdGluZyB0aGUgcG9ydGZvbGlvcyAoY291bGQgdXNlIHF1YW50Y3V0KQogICAgICAgIGluZCA8LSB3aGljaChwcmVkIDw9IHF1YW50aWxlKHByZWQsIHFfcG9ydFtqXSkgJiBwcmVkID49IHF1YW50aWxlKHByZWQscV9wb3J0W2otMV0pKQogICAgICAgIHJfcG9ydFt0LGotMV0gPC0gbWVhbih6JFIxTV9GW2luZF0pICMgQWdncmVnYXRpbmcgc3RvY2sgcmV0dXJucyBpbnRvIHBvcnRmb2xpb3MKICAgIH0KfQpwcm9jLnRpbWUoKSAtIHB0bSAgICAgICAgICAgICAgICAgICAgICAgICAgICMgQ1BVIHRpbWUgZGlmZmVyZW5jZQoKY29sTWVhbnMocl9wb3J0KSAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIEF2ZXJhZ2Ugb2YgcG9ydGZvbGlvIHJldHVybnMKdC50ZXN0KHJfcG9ydFssNl0tcl9wb3J0WywxXSkgICAgICAgICAgICAgICAjIHQtdGVzdCBvbiBkaWZmZXJlbmNlIG9mIGV4dHJlbWUgcG9ydGZvbGlvcwoKZXcgPC0gZGF0YV9tYXRfcmsgJT4lICAgICAgICAgICAgICAgICAgICAgICAjIEJlbmNobWFyazogRVcgcG9ydGZvbGlvCiAgICBncm91cF9ieShEYXRhRGF0ZSkgJT4lCiAgICBzdW1tYXJpc2UoUmV0ID0gbWVhbihSMU1fRikpCm1lYW4oZXckUmV0KSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBBdmVyYWdlIHJldHVybiBvZiBiZW5jaG1hcmsKYGBgCgoK