Packages: make sure they are installed

library(tidyverse)
library(rpart)
library(rpart.plot)
library(gridExtra)

Toy example: setup

We run simulations and create the first graph.

sample <- 10^3                                                       # Sample size
x1 <- runif(sample)                                                  # x1 variable
x2 <- runif(sample)                                                  # x2 variable
y <- x1/3 + sin(10*x2)/4 + x2/9 - 0.12 + rnorm(sample)/10
data <- data.frame(y, x1, x2)                                        # Aggregate dataset
data %>% ggplot() +                                                  # Plot
    geom_point(aes(x = x1, y = y), color = "blue2", size = 0.3, alpha = 0.4) +
    geom_point(aes(x = x2, y = y), color = "red2", size = 0.3, alpha = 0.4) +
    stat_smooth(aes(x = x1, y = y), color = "blue2") +
    stat_smooth(aes(x = x2, y = y), color = "red2") +
    xlab("x1 (blue) or x2 (red)")

Gain analysis (pre-filter)

We compute the important indicators stemming from splits (the gains essentially).

sample <- 10^6                                                       # More points & then start again.
x1 <- runif(sample)                                                  # x1 variable
x2 <- runif(sample)                                                  # x2 variable
y <- x1/3 + sin(10*x2)/4 + x2/9 - 0.12 + rnorm(sample)/10
data <- data.frame(y, x1, x2)   
gain <- function(fit){     # Simple function
    out_fit <- fit$frame
    var10 <- out_fit$dev[1]
    var11 <- out_fit$dev[2]  
    var12 <- out_fit$dev[3]
    return((var10-var11-var12)/out_fit$n[1])
}
fit1 <- rpart(y ~ x1, data = data, maxdepth = 1)
gain(fit1) # Gain realized by first variable
[1] 0.007008279
fit2 <- rpart(y ~ x2, data = data, maxdepth = 1)
gain(fit2) # Gain realized by second variable
[1] 0.007792604

The second variable leads to the highest gain: this is the one that would be chosen.

And plot the corresponding trees.

layout(matrix(c(1,2), 1, 2, byrow = TRUE))
rpart.plot(fit1)
rpart.plot(fit2)

Tree post-filter

We show what happens if you filter the data upfront: the split changes.

q <- 0.15
sample <- 10^4                                                       # Generate again.
x1 <- runif(sample)                                                  # x1 variable
x2 <- runif(sample)                                                  # x2 variable
y <- x1/3 + sin(10*x2)/4 + x2/9 - 0.12 + rnorm(sample)/10
data <- data.frame(y, x1, x2) 
data_filter <- data %>% filter(y < quantile(y,q) | y > quantile(y, 1-q))
fit <- rpart(y ~ x1 + x2, data = data_filter, maxdepth = 1)  
rpart.plot(fit)

Graphical explanation

This one is a bit long, we keep a lot of information. We show the separations dictated by the splits via a vertical line.

fit1 <- rpart(y ~ x1, data = data_filter, maxdepth = 1)
fit2 <- rpart(y ~ x2, data = data_filter, maxdepth = 1)
split1 <- fit1$splits[fit1$splits[,3]==min(fit1$splits[,3]),4]
split2 <- fit2$splits[fit2$splits[,3]==min(fit2$splits[,3]),4]
data_filter <- data %>% filter(y < quantile(y,q) | y > quantile(y, 1-q))
g1 <- data_filter %>% 
    ggplot() +
    geom_point(aes(x = x1, y = y), color = "blue", size = 0.3) +
    stat_smooth(aes(x = x1, y = y)) +
    annotate("segment", x = split1, xend = 1, 
             y = mean(data_filter$y[data_filter$x1>split1]), 
             yend = mean(data_filter$y[data_filter$x1>split1]), 
             colour = "black", linetype = 2, size = 1.5) +
    annotate("segment", x = 0, xend = split1, 
             y = mean(data_filter$y[data_filter$x1<split1]), 
             yend = mean(data_filter$y[data_filter$x1<split1]), 
             colour = "black", linetype = 2, size = 1.5) +
    annotate("text", x = split1-0.05, y = 0.6 , label = "split", color = "red") +
    geom_vline(xintercept = split1, color = "red") +
    ggtitle("First variable")
g2 <- data_filter %>% 
    ggplot() +
    geom_point(aes(x = x2, y = y), color = "red", size = 0.3) +
    stat_smooth(aes(x = x2, y = y)) +
    annotate("segment", x = split2, xend = 1, 
             y = mean(data_filter$y[data_filter$x2>split2]), 
             yend = mean(data_filter$y[data_filter$x2>split2]), 
             colour = "black", linetype = 2, size = 1.5) +
    annotate("segment", x = 0, xend = split2, 
             y = mean(data_filter$y[data_filter$x2<split2]), 
             yend = mean(data_filter$y[data_filter$x2<split2]), 
             colour = "black", linetype = 2, size = 1.5) +
    annotate("text", x = split2-0.05, y = 0.6 , label = "split", color = "red") +
    geom_vline(xintercept = split2, color = "red") +
    ggtitle("Second variable")
grid.arrange(g1,g2)

LS0tCnRpdGxlOiAiVGhlIHRveSBleGFtcGxlIgpvdXRwdXQ6IAogICAgaHRtbF9ub3RlYm9vawotLS0KCiMgUGFja2FnZXM6IG1ha2Ugc3VyZSB0aGV5IGFyZSBpbnN0YWxsZWQKCmBgYHtyIHBhY2t9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHJwYXJ0KQpsaWJyYXJ5KHJwYXJ0LnBsb3QpCmxpYnJhcnkoZ3JpZEV4dHJhKQpgYGAKCgojIFRveSBleGFtcGxlOiBzZXR1cAoKV2UgcnVuIHNpbXVsYXRpb25zIGFuZCBjcmVhdGUgdGhlIGZpcnN0IGdyYXBoLgoKYGBge3IgbGluX3F1YWQsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpzYW1wbGUgPC0gMTBeMyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIFNhbXBsZSBzaXplCngxIDwtIHJ1bmlmKHNhbXBsZSkgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgeDEgdmFyaWFibGUKeDIgPC0gcnVuaWYoc2FtcGxlKSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyB4MiB2YXJpYWJsZQp5IDwtIHgxLzMgKyBzaW4oMTAqeDIpLzQgKyB4Mi85IC0gMC4xMiArIHJub3JtKHNhbXBsZSkvMTAKZGF0YSA8LSBkYXRhLmZyYW1lKHksIHgxLCB4MikgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBBZ2dyZWdhdGUgZGF0YXNldApkYXRhICU+JSBnZ3Bsb3QoKSArICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIFBsb3QKICAgIGdlb21fcG9pbnQoYWVzKHggPSB4MSwgeSA9IHkpLCBjb2xvciA9ICJibHVlMiIsIHNpemUgPSAwLjMsIGFscGhhID0gMC40KSArCiAgICBnZW9tX3BvaW50KGFlcyh4ID0geDIsIHkgPSB5KSwgY29sb3IgPSAicmVkMiIsIHNpemUgPSAwLjMsIGFscGhhID0gMC40KSArCiAgICBzdGF0X3Ntb290aChhZXMoeCA9IHgxLCB5ID0geSksIGNvbG9yID0gImJsdWUyIikgKwogICAgc3RhdF9zbW9vdGgoYWVzKHggPSB4MiwgeSA9IHkpLCBjb2xvciA9ICJyZWQyIikgKwogICAgeGxhYigieDEgKGJsdWUpIG9yIHgyIChyZWQpIikKYGBgCgoKIyBHYWluIGFuYWx5c2lzIChwcmUtZmlsdGVyKQoKV2UgY29tcHV0ZSB0aGUgaW1wb3J0YW50IGluZGljYXRvcnMgc3RlbW1pbmcgZnJvbSBzcGxpdHMgKHRoZSBnYWlucyBlc3NlbnRpYWxseSkuCgpgYGB7ciBnYWlucywgbWVzc2FnZSA9IEZBTFNFLCB3YXJuaW5nID0gRkFMU0V9CnNhbXBsZSA8LSAxMF42ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgTW9yZSBwb2ludHMgJiB0aGVuIHN0YXJ0IGFnYWluLgp4MSA8LSBydW5pZihzYW1wbGUpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIHgxIHZhcmlhYmxlCngyIDwtIHJ1bmlmKHNhbXBsZSkgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgeDIgdmFyaWFibGUKeSA8LSB4MS8zICsgc2luKDEwKngyKS80ICsgeDIvOSAtIDAuMTIgKyBybm9ybShzYW1wbGUpLzEwCmRhdGEgPC0gZGF0YS5mcmFtZSh5LCB4MSwgeDIpIAoKZ2FpbiA8LSBmdW5jdGlvbihmaXQpeyAgICAgIyBTaW1wbGUgZnVuY3Rpb24gdG8gZXh0cmFjdCB0aGUgZ2FpbnMKICAgIG91dF9maXQgPC0gZml0JGZyYW1lCiAgICB2YXIxMCA8LSBvdXRfZml0JGRldlsxXQogICAgdmFyMTEgPC0gb3V0X2ZpdCRkZXZbMl0gIAogICAgdmFyMTIgPC0gb3V0X2ZpdCRkZXZbM10KICAgIHJldHVybigodmFyMTAtdmFyMTEtdmFyMTIpL291dF9maXQkblsxXSkKfQoKZml0MSA8LSBycGFydCh5IH4geDEsIGRhdGEgPSBkYXRhLCBtYXhkZXB0aCA9IDEpCmdhaW4oZml0MSkgIyBHYWluIHJlYWxpemVkIGJ5IGZpcnN0IHZhcmlhYmxlCmZpdDIgPC0gcnBhcnQoeSB+IHgyLCBkYXRhID0gZGF0YSwgbWF4ZGVwdGggPSAxKQpnYWluKGZpdDIpICMgR2FpbiByZWFsaXplZCBieSBzZWNvbmQgdmFyaWFibGUKYGBgCgpUaGUgc2Vjb25kIHZhcmlhYmxlIGxlYWRzIHRvIHRoZSBoaWdoZXN0IGdhaW46IHRoaXMgaXMgdGhlIG9uZSB0aGF0IHdvdWxkIGJlIGNob3Nlbi4KCkFuZCBwbG90IHRoZSBjb3JyZXNwb25kaW5nIHRyZWVzLgpgYGB7ciB0cmVlX3ByZWZpbHRlcn0KbGF5b3V0KG1hdHJpeChjKDEsMiksIDEsIDIsIGJ5cm93ID0gVFJVRSkpCnJwYXJ0LnBsb3QoZml0MSkKcnBhcnQucGxvdChmaXQyKQpgYGAKCiMgVHJlZSBwb3N0LWZpbHRlcgoKV2Ugc2hvdyB3aGF0IGhhcHBlbnMgaWYgeW91IGZpbHRlciB0aGUgZGF0YSB1cGZyb250OiB0aGUgc3BsaXQgY2hhbmdlcy4KCmBgYHtyIHRyZWVfZmlsdGVyfQpxIDwtIDAuMTUKc2FtcGxlIDwtIDEwXjQgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBHZW5lcmF0ZSBhZ2Fpbi4KeDEgPC0gcnVuaWYoc2FtcGxlKSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyB4MSB2YXJpYWJsZQp4MiA8LSBydW5pZihzYW1wbGUpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIHgyIHZhcmlhYmxlCnkgPC0geDEvMyArIHNpbigxMCp4MikvNCArIHgyLzkgLSAwLjEyICsgcm5vcm0oc2FtcGxlKS8xMApkYXRhIDwtIGRhdGEuZnJhbWUoeSwgeDEsIHgyKSAKZGF0YV9maWx0ZXIgPC0gZGF0YSAlPiUgZmlsdGVyKHkgPCBxdWFudGlsZSh5LHEpIHwgeSA+IHF1YW50aWxlKHksIDEtcSkpCmZpdCA8LSBycGFydCh5IH4geDEgKyB4MiwgZGF0YSA9IGRhdGFfZmlsdGVyLCBtYXhkZXB0aCA9IDEpICAKcnBhcnQucGxvdChmaXQpCmBgYAoKIyBHcmFwaGljYWwgZXhwbGFuYXRpb24KClRoaXMgb25lIGlzIGEgYml0IGxvbmcsIHdlIGtlZXAgYSBsb3Qgb2YgaW5mb3JtYXRpb24uIFdlIHNob3cgdGhlIHNlcGFyYXRpb25zIGRpY3RhdGVkIGJ5IHRoZSBzcGxpdHMgdmlhIGEgdmVydGljYWwgbGluZS4KCmBgYHtyIGZpbHRlciwgbWVzc2FnZSA9IEZBTFNFLCB3YXJuaW5nID0gRkFMU0V9CmZpdDEgPC0gcnBhcnQoeSB+IHgxLCBkYXRhID0gZGF0YV9maWx0ZXIsIG1heGRlcHRoID0gMSkKZml0MiA8LSBycGFydCh5IH4geDIsIGRhdGEgPSBkYXRhX2ZpbHRlciwgbWF4ZGVwdGggPSAxKQpzcGxpdDEgPC0gZml0MSRzcGxpdHNbZml0MSRzcGxpdHNbLDNdPT1taW4oZml0MSRzcGxpdHNbLDNdKSw0XQpzcGxpdDIgPC0gZml0MiRzcGxpdHNbZml0MiRzcGxpdHNbLDNdPT1taW4oZml0MiRzcGxpdHNbLDNdKSw0XQpkYXRhX2ZpbHRlciA8LSBkYXRhICU+JSBmaWx0ZXIoeSA8IHF1YW50aWxlKHkscSkgfCB5ID4gcXVhbnRpbGUoeSwgMS1xKSkKCmcxIDwtIGRhdGFfZmlsdGVyICU+JSAKICAgIGdncGxvdCgpICsKICAgIGdlb21fcG9pbnQoYWVzKHggPSB4MSwgeSA9IHkpLCBjb2xvciA9ICJibHVlIiwgc2l6ZSA9IDAuMykgKwogICAgc3RhdF9zbW9vdGgoYWVzKHggPSB4MSwgeSA9IHkpKSArCiAgICBhbm5vdGF0ZSgic2VnbWVudCIsIHggPSBzcGxpdDEsIHhlbmQgPSAxLCAKICAgICAgICAgICAgIHkgPSBtZWFuKGRhdGFfZmlsdGVyJHlbZGF0YV9maWx0ZXIkeDE+c3BsaXQxXSksIAogICAgICAgICAgICAgeWVuZCA9IG1lYW4oZGF0YV9maWx0ZXIkeVtkYXRhX2ZpbHRlciR4MT5zcGxpdDFdKSwgCiAgICAgICAgICAgICBjb2xvdXIgPSAiYmxhY2siLCBsaW5ldHlwZSA9IDIsIHNpemUgPSAxLjUpICsKICAgIGFubm90YXRlKCJzZWdtZW50IiwgeCA9IDAsIHhlbmQgPSBzcGxpdDEsIAogICAgICAgICAgICAgeSA9IG1lYW4oZGF0YV9maWx0ZXIkeVtkYXRhX2ZpbHRlciR4MTxzcGxpdDFdKSwgCiAgICAgICAgICAgICB5ZW5kID0gbWVhbihkYXRhX2ZpbHRlciR5W2RhdGFfZmlsdGVyJHgxPHNwbGl0MV0pLCAKICAgICAgICAgICAgIGNvbG91ciA9ICJibGFjayIsIGxpbmV0eXBlID0gMiwgc2l6ZSA9IDEuNSkgKwogICAgYW5ub3RhdGUoInRleHQiLCB4ID0gc3BsaXQxLTAuMDUsIHkgPSAwLjYgLCBsYWJlbCA9ICJzcGxpdCIsIGNvbG9yID0gInJlZCIpICsKICAgIGdlb21fdmxpbmUoeGludGVyY2VwdCA9IHNwbGl0MSwgY29sb3IgPSAicmVkIikgKwogICAgZ2d0aXRsZSgiRmlyc3QgdmFyaWFibGUiKQoKZzIgPC0gZGF0YV9maWx0ZXIgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9wb2ludChhZXMoeCA9IHgyLCB5ID0geSksIGNvbG9yID0gInJlZCIsIHNpemUgPSAwLjMpICsKICAgIHN0YXRfc21vb3RoKGFlcyh4ID0geDIsIHkgPSB5KSkgKwogICAgYW5ub3RhdGUoInNlZ21lbnQiLCB4ID0gc3BsaXQyLCB4ZW5kID0gMSwgCiAgICAgICAgICAgICB5ID0gbWVhbihkYXRhX2ZpbHRlciR5W2RhdGFfZmlsdGVyJHgyPnNwbGl0Ml0pLCAKICAgICAgICAgICAgIHllbmQgPSBtZWFuKGRhdGFfZmlsdGVyJHlbZGF0YV9maWx0ZXIkeDI+c3BsaXQyXSksIAogICAgICAgICAgICAgY29sb3VyID0gImJsYWNrIiwgbGluZXR5cGUgPSAyLCBzaXplID0gMS41KSArCiAgICBhbm5vdGF0ZSgic2VnbWVudCIsIHggPSAwLCB4ZW5kID0gc3BsaXQyLCAKICAgICAgICAgICAgIHkgPSBtZWFuKGRhdGFfZmlsdGVyJHlbZGF0YV9maWx0ZXIkeDI8c3BsaXQyXSksIAogICAgICAgICAgICAgeWVuZCA9IG1lYW4oZGF0YV9maWx0ZXIkeVtkYXRhX2ZpbHRlciR4MjxzcGxpdDJdKSwgCiAgICAgICAgICAgICBjb2xvdXIgPSAiYmxhY2siLCBsaW5ldHlwZSA9IDIsIHNpemUgPSAxLjUpICsKICAgIGFubm90YXRlKCJ0ZXh0IiwgeCA9IHNwbGl0Mi0wLjA1LCB5ID0gMC42ICwgbGFiZWwgPSAic3BsaXQiLCBjb2xvciA9ICJyZWQiKSArCiAgICBnZW9tX3ZsaW5lKHhpbnRlcmNlcHQgPSBzcGxpdDIsIGNvbG9yID0gInJlZCIpICsKICAgIGdndGl0bGUoIlNlY29uZCB2YXJpYWJsZSIpCmdyaWQuYXJyYW5nZShnMSxnMikKYGBg