First, we install the partitioning packages.

# Install the tree packages only when they are not already available.
# requireNamespace() tests availability without attaching the package and,
# with quietly = TRUE, without the warning that require() emits on failure;
# the packages are attached explicitly by the library() calls further down.
for (pkg in c("rpart", "rpart.plot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

Then, we activate the relevant libraries.

library(tidyverse)
library(rpart)
library(rpart.plot)
library(gapminder)

Finally, we load the datasets.

# Restore the saved workspaces from the working directory. The rest of the
# notebook uses objects named `movies` and `songs`, presumably provided by
# these two files (confirm against the .RData contents).
load(file = "movies.RData")
load(file = "songs.RData")

The syntax of models is simple and matches that of linear models. It’s: dependent variable ~ var1 + var2 + …
cp (the complexity parameter) sets the minimum improvement in fit that a split must bring to be kept: the smaller it is, the deeper the tree.
maxdepth limits the depth.

# Regression tree for the IMDb score from a handful of movie characteristics.
# The response is imdb_score so that the plotted tree matches the
# interpretation given in the text that follows.
fit <- rpart(
  imdb_score ~ budget + earnings + color + duration + country,  # `language` deliberately left out
  data = movies,
  cp = 0.001,    # complexity parameter: smaller values allow more splits
  maxdepth = 3   # hard cap on the depth of the tree
)
rpart.plot(fit)

The main driver of imdb_score is the length of the movie. Short movies (shorter than 111 minutes) have an average score of 6.2, while longer ones have an average score of 6.9. Very long movies (longer than 138 minutes) have the highest score: 7.4 on average.

# Store the categorical columns as factors before fitting.
movies <- movies %>%
  mutate(
    color = as.factor(color),
    country = as.factor(country),
    rating = as.factor(rating)
  )

# Regression tree for imdb_score using every remaining column as a predictor
# (`~ .` is the short form for "all other variables"). The free-text
# identifier columns are dropped first.
# model = TRUE stores the model frame in the fit. Because `data` is a pipe
# expression, rpart.plot() cannot recover the data from the stored call and
# warns "Bad 'data' field in model 'call'"; keeping the model frame is one of
# the two remedies named in that warning.
fit_movies <- rpart(
  imdb_score ~ .,
  data = movies %>% select(-title, -director, -actor_1, -actor_2, -actor_3),
  model = TRUE,
  cp = 0.001,
  maxdepth = 2
)
rpart.plot(fit_movies)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.

With songs.

# Shallow regression tree for song popularity using all remaining columns.
# model = TRUE keeps the model frame in the fit: `data` is a pipe expression,
# the same pattern that makes rpart.plot() warn about a bad 'data' field for
# the other piped fits in this notebook.
fit_songs <- rpart(
  popularity ~ .,                                        # short formula: all other columns
  data = songs %>% select(-song_name, -artist, -album),  # remove unnecessary variables
  method = "anova",                                      # regression ("class" = classification)
  model = TRUE,
  cp = 0.0001,
  maxdepth = 3
)
rpart.plot(fit_songs)

Models can be complicated.

# Deeper regression tree for song popularity (maxdepth = 5), to show how
# quickly a tree becomes hard to read.
# model = TRUE keeps the model frame in the fit: `data` is a pipe expression,
# the same pattern that makes rpart.plot() warn about a bad 'data' field for
# the other piped fits in this notebook.
fit_songs2 <- rpart(
  popularity ~ .,                                        # short formula: all other columns
  data = songs %>% select(-song_name, -artist, -album),  # remove unnecessary variables
  method = "anova",                                      # regression ("class" = classification)
  model = TRUE,
  cp = 0.0001,
  maxdepth = 5
)
rpart.plot(fit_songs2)

Decision trees are great for predictions:

# Predicted popularity of the song in row 9000, using the deeper tree.
predict(fit_songs2, newdata = songs[9000, ])
       1 
48.66515 
# Observed popularity of the same song, for comparison with the prediction.
songs$popularity[9000]
[1] 45

Performance indicators:

# In-sample predictions of the shallow tree (fit_songs) for every song.
all_predictions <- predict(fit_songs, newdata = songs)
prediction_errors <- all_predictions - songs$popularity

# Mean absolute error.
mean(abs(prediction_errors))

# R-squared: one minus the ratio of the mean squared error to the variance of
# popularity around its mean.
1 - mean(prediction_errors^2) / mean((songs$popularity - mean(songs$popularity))^2)

GAPMINDER

# Regression tree for life expectancy from population and GDP per capita,
# restricted to observations after 2005.
data("gapminder")
force(gapminder)  # evaluates the dataset; at top level this also prints it

# Note: this reuses the name `fit`, replacing the movies tree fitted earlier.
# model = TRUE stores the model frame in the fit. Because `data` is a pipe
# expression, rpart.plot() cannot recover the data from the stored call and
# warns "Bad 'data' field in model 'call'"; keeping the model frame is one of
# the two remedies named in that warning.
fit <- rpart(
  lifeExp ~ pop + gdpPercap,
  data = gapminder %>% filter(year > 2005),
  model = TRUE,
  cp = 0.0001,
  maxdepth = 3
)
rpart.plot(fit)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.

Classification

# Classification tree: predict the `color` column of movies from all
# remaining columns, after dropping the free-text identifier columns.
# model = TRUE stores the model frame in the fit. Because `data` is a pipe
# expression, rpart.plot() cannot recover the data from the stored call and
# warns "Bad 'data' field in model 'call'"; keeping the model frame is one of
# the two remedies named in that warning.
fit_color <- rpart(
  color ~ .,                                                                  # short formula: all other columns
  data = movies %>% select(-title, -director, -actor_1, -actor_2, -actor_3),  # remove unnecessary variables
  method = "class",                                                           # classification ("anova" = regression)
  model = TRUE,
  cp = 0.0001,
  maxdepth = 3
)
rpart.plot(fit_color)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.

Other dendrograms

# Hierarchical clustering dendrogram of a small sample of movies, built from
# four numeric columns.
# NOTE(review): the columns are passed to dist() without rescaling, so
# columns with large magnitudes will presumably dominate the distances --
# confirm this is intended.
movies[1:60,] %>%    # First 60 movies (rows 1 to 60)
    select(budget, earnings, duration, imdb_score) %>%
    dist() %>%       # Pairwise distances between movies (dist() default method)
    hclust() %>%     # Agglomerative clustering (hclust() default linkage)
    plot(cex = 0.7)  # Draws the dendrogram with smaller leaf labels

Gapminder?

LS0tCnRpdGxlOiAiUzc6IERlY2lzaW9uIFRyZWVzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpGaXJzdCwgd2UgaW5zdGFsbCB0aGUgcGFydGl0aW9uaW5nIHBhY2thZ2VzLgoKYGBge3J9CmlmKCFyZXF1aXJlKHJwYXJ0KSl7aW5zdGFsbC5wYWNrYWdlcyhjKCJycGFydCIpKX0KaWYoIXJlcXVpcmUocnBhcnQucGxvdCkpe2luc3RhbGwucGFja2FnZXMoYygicnBhcnQucGxvdCIpKX0KYGBgCgpUaGVuLCB3ZSBhY3RpdmF0ZSB0aGUgcmVsZXZhbnQgbGlicmFyaWVzLgoKYGBge3IsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShycGFydCkKbGlicmFyeShycGFydC5wbG90KQpsaWJyYXJ5KGdhcG1pbmRlcikKYGBgCgpGaW5hbGx5LCB3ZSBsb2FkIHRoZSBkYXRhc2V0cy4KCmBgYHtyfQpsb2FkKCJtb3ZpZXMuUkRhdGEiKQpsb2FkKCJzb25ncy5SRGF0YSIpCmBgYAoKVGhlIHN5bnRheCBvZiBtb2RlbHMgaXMgc2ltcGxlIGFuZCBtYXRjaGVzIHRoYXQgb2YgbGluZWFyIG1vZGVscy4KSXQnczogKipkZXBlbmRlbnQgdmFyaWFibGUgfiB2YXIxICsgdmFyMiArIC4uLioqICAgIAoqKmNwKiogY29udHJvbHMgdGhlIGRlcHRoIG9mIHRoZSB0cmVlOiB0aGUgc21hbGxlciBpdCBpcywgdGhlIGRlZXBlciB0aGUgdHJlZS4gIAoqKm1heGRlcHRoKiogbGltaXRzIHRoZSBkZXB0aC4KCmBgYHtyfQpmaXQgPC0gcnBhcnQoaW1kYl9zY29yZSB+IGJ1ZGdldCArIGVhcm5pbmdzICsgY29sb3IgKyBkdXJhdGlvbiArIGNvdW50cnkgKyBsYW5ndWFnZSwgCiAgICAgICAgICAgICBkYXRhID0gbW92aWVzLCAKICAgICAgICAgICAgIGNwID0gMC4wMDEsIAogICAgICAgICAgICAgbWF4ZGVwdGggPSAzKQpycGFydC5wbG90KGZpdCkKYGBgCgpUaGUgbWFpbiBkcml2ZXIgb2YgKippbWRiX3Njb3JlKiogaXMgdGhlIGxlbmd0aCBvZiB0aGUgbW92aWUuIFNob3J0IG1vdmllcyAoc2hvcnRlciB0aGFuIDExMSBtaW51dGVzKSBoYXZlIGFuIGF2ZXJhZ2Ugc2NvcmUgb2YgNi4yIHdoaWxlIGxvbmdlciBvbmVzIGhhdmUgYW4gYXZlcmFnZSBzY29yZSBvZiA2LjkuIFZlcnkgbG9uZyBtb3ZpZXMgKGxlbmd0aCBsYXJnZXIgdGhhbiAxMzggbWludXRlcyBoYXZlIHRoZSBoaWdoZXN0IHNjb3JlOiA3LjQgb24gYXZlcmFnZSkuIAoKYGBge3J9Cm1vdmllcyA8LSBtb3ZpZXMgJT4lIAogICAgbXV0YXRlKGNvbG9yID0gYXMuZmFjdG9yKGNvbG9yKSwKICAgICAgICAgICBjb3VudHJ5ID0gYXMuZmFjdG9yKGNvdW50cnkpLAogICAgICAgICAgIHJhdGluZyA9IGFzLmZhY3RvcihyYXRpbmcpKQpmaXRfbW92aWVzIDwtIHJwYXJ0KGltZGJfc2NvcmUgfiAuICwgICAgICAgICAgICAgICMgU2hvcnQgZm9ybWF0IHRvIGluY2x1ZGUgYWxsIHZhcmlhYmxlcyAhISEKICAgICAgICAgICAgIGRhdGEgPSBtb3ZpZXMgJT4lIHNlbGVjdCgtdGl0bGUsIC1kaXJlY3RvciwgLWFjdG9yXzEsIC1hY3Rvcl8yLCAtYWN0b3JfMyks
IAogICAgICAgICAgICAgY3AgPSAwLjAwMSwgCiAgICAgICAgICAgICBtYXhkZXB0aCA9IDIpCnJwYXJ0LnBsb3QoZml0X21vdmllcykKYGBgCgpXaXRoIHNvbmdzLiAKCmBgYHtyLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRX0KZml0X3NvbmdzIDwtIHJwYXJ0KHBvcHVsYXJpdHkgfiAuLCAgICAjIFNob3J0IGZvcm11bGEhCiAgICAgICAgICAgICBkYXRhID0gc29uZ3MgJT4lIHNlbGVjdCgtc29uZ19uYW1lLCAtYXJ0aXN0LCAtYWxidW0pLCAgIyBSZW1vdmUgdW5uZWNlc3NhcnkgdmFyaWFibGVzCiAgICAgICAgICAgICBtZXRob2QgPSAiYW5vdmEiLCAjIE1ldGhvZDogcmVncmVzc2lvbiAiYW5vdmEiIHZzIGNsYXNzaWZpY2F0aW9uICJjbGFzcyIKICAgICAgICAgICAgIGNwID0gMC4wMDAxLCAKICAgICAgICAgICAgIG1heGRlcHRoID0gMykKcnBhcnQucGxvdChmaXRfc29uZ3MpCmBgYAoKTW9kZWxzIGNhbiBiZSBjb21wbGljYXRlZC4KCmBgYHtyLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRX0KZml0X3NvbmdzMiA8LSBycGFydChwb3B1bGFyaXR5IH4gLiwgICAgIyBTaG9ydCBmb3JtdWxhIQogICAgICAgICAgICAgZGF0YSA9IHNvbmdzICU+JSBzZWxlY3QoLXNvbmdfbmFtZSwgLWFydGlzdCwgLWFsYnVtKSwgICMgUmVtb3ZlIHVubmVjZXNzYXJ5IHZhcmlhYmxlcwogICAgICAgICAgICAgbWV0aG9kID0gImFub3ZhIiwgIyBNZXRob2Q6IHJlZ3Jlc3Npb24gImFub3ZhIiB2cyBjbGFzc2lmaWNhdGlvbiAiY2xhc3MiCiAgICAgICAgICAgICBjcCA9IDAuMDAwMSwgCiAgICAgICAgICAgICBtYXhkZXB0aCA9IDUpCnJwYXJ0LnBsb3QoZml0X3NvbmdzMikKYGBgCgpEZWNpc2lvbiB0cmVlcyBhcmUgZ3JlYXQgZm9yIHByZWRpY3Rpb25zOgoKYGBge3J9CnByZWRpY3QoZml0X3NvbmdzMiwgc29uZ3NbOTAwMCxdKSAjIFByZWRpY3Rpb24Kc29uZ3NbOTAwMCxdJHBvcHVsYXJpdHkgICAgICAgICAgIyBUcnVlIHZhbHVlCmBgYAoKUGVyZm9ybWFuY2UgaW5kaWNhdG9yczoKCmBgYHtyfQphbGxfcHJlZGljdGlvbnMgPC0gcHJlZGljdChmaXRfc29uZ3MsIHNvbmdzKQptZWFuKGFicyhhbGxfcHJlZGljdGlvbnMgLSBzb25ncyRwb3B1bGFyaXR5KSkKYGBgCgoKYGBge3J9CjEgLSBtZWFuKChhbGxfcHJlZGljdGlvbnMgLSBzb25ncyRwb3B1bGFyaXR5KV4yKSAvIG1lYW4oKHNvbmdzJHBvcHVsYXJpdHkgLSBtZWFuKHNvbmdzJHBvcHVsYXJpdHkpKV4yKQpgYGAKCgpHQVBNSU5ERVIKCmBgYHtyfQpkYXRhKCJnYXBtaW5kZXIiKQpmb3JjZShnYXBtaW5kZXIpCmZpdCA8LSBycGFydChsaWZlRXhwIH4gcG9wICsgZ2RwUGVyY2FwLAogICAgICAgICAgICAgZGF0YSA9IGdhcG1pbmRlciAlPiUgZmlsdGVyKHllYXIgPiAyMDA1KSwKICAgICAgICAgICAgIGNwID0gMC4wMDAxLAogICAgICAgICAgICAgbWF4ZGVwdGggPSAzKQpycGFydC5wbG90KGZpdCkKYGBgCgoKCiMgQ2xhc3NpZmljYXRpb24KCmBgYHtyfQpmaXRfY29s
b3IgPC0gcnBhcnQoY29sb3IgfiAuLCAgICAjIFNob3J0IGZvcm11bGEhCiAgICAgICAgICAgICBkYXRhID0gbW92aWVzICU+JSBzZWxlY3QoLXRpdGxlLCAtZGlyZWN0b3IsIC1hY3Rvcl8xLCAtYWN0b3JfMiwgLWFjdG9yXzMpLCAgIyBSZW1vdmUgdW5uZWNlc3NhcnkgdmFyaWFibGVzCiAgICAgICAgICAgICBtZXRob2QgPSAiY2xhc3MiLCAjIE1ldGhvZDogcmVncmVzc2lvbiAiYW5vdmEiIHZzIGNsYXNzaWZpY2F0aW9uICJjbGFzcyIKICAgICAgICAgICAgIGNwID0gMC4wMDAxLCAKICAgICAgICAgICAgIG1heGRlcHRoID0gMykKcnBhcnQucGxvdChmaXRfY29sb3IpCmBgYAoKCgojIE90aGVyIGRlbmRyb2dyYW1zCgpgYGB7cn0KbW92aWVzWzE6NjAsXSAlPiUgICAgIyBGaXJzdCA1MCBtb3ZpZXMKICAgIHNlbGVjdChidWRnZXQsIGVhcm5pbmdzLCBkdXJhdGlvbiwgaW1kYl9zY29yZSkgJT4lCiAgICBkaXN0KCkgJT4lICAgICAgICMgQ29tcHV0ZXMgZGlzdGFuY2UKICAgIGhjbHVzdCgpICU+JSAgICAgIyBDcmVhdGVzIGNsdXN0ZXJzCiAgICBwbG90KGNleCA9IDAuNykgICMgUGxvdHMKYGBgCgpHYXBtaW5kZXI/CgpgYGB7cn0KCmBgYAoK