First, we install the partitioning packages.
# Install the tree-modelling packages only when they are not yet available.
# requireNamespace() checks for a package without attaching it; attaching is
# left to the library() calls that follow.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
Then, we activate the relevant libraries.
library(tidyverse)
library(rpart)
library(rpart.plot)
library(gapminder)
Finally, we load the datasets.
# Restore the objects saved in the two .RData files into the current
# environment (presumably the movies and songs data frames used below;
# confirm against the files).
load(file = "movies.RData")
load(file = "songs.RData")
The syntax of models is simple and matches that of linear models. It’s: dependent variable ~ var1 + var2 + …
cp controls the depth of the tree: the smaller it is, the deeper the tree.
maxdepth limits the depth.
# Regression tree for the IMDb score from a handful of movie features.
# The response is imdb_score so that the plotted tree matches the
# interpretation given in the text that follows this chunk.
# cp is the complexity parameter (smaller values allow more splits) and
# maxdepth caps the depth of the tree.
# NOTE(review): language was commented out of the original formula and is
# still excluded here; confirm whether it should be restored.
fit <- rpart(
  imdb_score ~ budget + earnings + color + duration + country,
  data = movies,
  cp = 0.001,
  maxdepth = 3
)
rpart.plot(fit)

The main driver of imdb_score is the length of the movie. Short movies (shorter than 111 minutes) have an average score of 6.2, while longer ones have an average score of 6.9. Very long movies (longer than 138 minutes) have the highest score: 7.4 on average.
# Convert color, country and rating to factors.
movies <- movies %>%
  mutate(
    color = as.factor(color),
    country = as.factor(country),
    rating = as.factor(rating)
  )

# Regression tree for imdb_score on every remaining column ("." in the
# formula), after dropping the title, director and actor columns.
# model = TRUE stores the model frame in the fit: because data is an inline
# pipe expression, rpart.plot() cannot recover the data from the call and
# would otherwise warn "Bad 'data' field in model 'call'".
fit_movies <- rpart(
  imdb_score ~ .,
  data = movies %>% select(-title, -director, -actor_1, -actor_2, -actor_3),
  cp = 0.001,
  maxdepth = 2,
  model = TRUE
)
rpart.plot(fit_movies)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.

With songs.
# Regression tree (method = "anova") for song popularity on every column
# except the song name, artist and album identifiers.
# model = TRUE stores the model frame in the fit so that rpart.plot() does
# not have to re-evaluate the piped data expression from the call, which is
# what triggers its "Bad 'data' field in model 'call'" warning.
fit_songs <- rpart(
  popularity ~ .,
  data = songs %>% select(-song_name, -artist, -album),
  method = "anova", # "anova" = regression, "class" = classification
  cp = 0.0001,
  maxdepth = 3,
  model = TRUE
)
rpart.plot(fit_songs)

Models can be complicated.
# Deeper regression tree for song popularity: same formula and data as the
# depth-3 fit, but with maxdepth = 5.
# model = TRUE stores the model frame in the fit so that rpart.plot() does
# not have to re-evaluate the piped data expression from the call, which is
# what triggers its "Bad 'data' field in model 'call'" warning.
fit_songs2 <- rpart(
  popularity ~ .,
  data = songs %>% select(-song_name, -artist, -album),
  method = "anova", # "anova" = regression, "class" = classification
  cp = 0.0001,
  maxdepth = 5,
  model = TRUE
)
rpart.plot(fit_songs2)

Decision trees are great for predictions:
# Predicted popularity for the song in row 9000, using the deeper tree.
predict(fit_songs2, newdata = songs[9000, ])
1
48.66515
# Observed popularity of that same song, for comparison with the prediction.
songs$popularity[9000]
[1] 45
Performance indicators:
# Predictions of fit_songs for every row of songs, i.e. the data it was
# fitted on.
all_predictions <- predict(fit_songs, newdata = songs)

# Mean absolute error of those predictions.
mean(abs(all_predictions - songs$popularity))

# R-squared: one minus the mean squared error divided by the mean squared
# deviation of popularity from its own mean.
1 - mean((all_predictions - songs$popularity)^2) /
  mean((songs$popularity - mean(songs$popularity))^2)
GAPMINDER
# Load the gapminder dataset; force() evaluates and returns its argument.
data("gapminder")
force(gapminder)

# Regression tree for life expectancy from population and GDP per capita,
# restricted to observations with year > 2005.
# model = TRUE stores the model frame in the fit: because data is an inline
# pipe expression, rpart.plot() cannot recover the data from the call and
# would otherwise warn "Bad 'data' field in model 'call'".
fit <- rpart(
  lifeExp ~ pop + gdpPercap,
  data = gapminder %>% filter(year > 2005),
  cp = 0.0001,
  maxdepth = 3,
  model = TRUE
)
rpart.plot(fit)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.

Classification
# Classification tree (method = "class") for the color column, using every
# remaining column after dropping the title, director and actor columns.
# model = TRUE stores the model frame in the fit: because data is an inline
# pipe expression, rpart.plot() cannot recover the data from the call and
# would otherwise warn "Bad 'data' field in model 'call'".
fit_color <- rpart(
  color ~ .,
  data = movies %>% select(-title, -director, -actor_1, -actor_2, -actor_3),
  method = "class", # "anova" = regression, "class" = classification
  cp = 0.0001,
  maxdepth = 3,
  model = TRUE
)
rpart.plot(fit_color)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.

Other dendrograms
# Hierarchical clustering of the first 60 movies on four numeric features,
# drawn as a dendrogram. The distance and linkage methods are spelled out;
# they are the defaults of dist() and hclust().
movies[1:60, ] %>% # First 60 movies
  select(budget, earnings, duration, imdb_score) %>%
  dist(method = "euclidean") %>% # Pairwise distances between movies
  hclust(method = "complete") %>% # Agglomerative clustering
  plot(cex = 0.7) # Dendrogram with smaller labels

Gapminder?
LS0tCnRpdGxlOiAiUzc6IERlY2lzaW9uIFRyZWVzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpGaXJzdCwgd2UgaW5zdGFsbCB0aGUgcGFydGl0aW9uaW5nIHBhY2thZ2VzLgoKYGBge3J9CmlmKCFyZXF1aXJlKHJwYXJ0KSl7aW5zdGFsbC5wYWNrYWdlcyhjKCJycGFydCIpKX0KaWYoIXJlcXVpcmUocnBhcnQucGxvdCkpe2luc3RhbGwucGFja2FnZXMoYygicnBhcnQucGxvdCIpKX0KYGBgCgpUaGVuLCB3ZSBhY3RpdmF0ZSB0aGUgcmVsZXZhbnQgbGlicmFyaWVzLgoKYGBge3IsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShycGFydCkKbGlicmFyeShycGFydC5wbG90KQpsaWJyYXJ5KGdhcG1pbmRlcikKYGBgCgpGaW5hbGx5LCB3ZSBsb2FkIHRoZSBkYXRhc2V0cy4KCmBgYHtyfQpsb2FkKCJtb3ZpZXMuUkRhdGEiKQpsb2FkKCJzb25ncy5SRGF0YSIpCmBgYAoKVGhlIHN5bnRheCBvZiBtb2RlbHMgaXMgc2ltcGxlIGFuZCBtYXRjaGVzIHRoYXQgb2YgbGluZWFyIG1vZGVscy4KSXQnczogKipkZXBlbmRlbnQgdmFyaWFibGUgfiB2YXIxICsgdmFyMiArIC4uLioqICAgIAoqKmNwKiogY29udHJvbHMgdGhlIGRlcHRoIG9mIHRoZSB0cmVlOiB0aGUgc21hbGxlciBpdCBpcywgdGhlIGRlZXBlciB0aGUgdHJlZS4gIAoqKm1heGRlcHRoKiogbGltaXRzIHRoZSBkZXB0aC4KCmBgYHtyfQpmaXQgPC0gcnBhcnQoaW1kYl9zY29yZSB+IGJ1ZGdldCArIGVhcm5pbmdzICsgY29sb3IgKyBkdXJhdGlvbiArIGNvdW50cnkgKyBsYW5ndWFnZSwgCiAgICAgICAgICAgICBkYXRhID0gbW92aWVzLCAKICAgICAgICAgICAgIGNwID0gMC4wMDEsIAogICAgICAgICAgICAgbWF4ZGVwdGggPSAzKQpycGFydC5wbG90KGZpdCkKYGBgCgpUaGUgbWFpbiBkcml2ZXIgb2YgKippbWRiX3Njb3JlKiogaXMgdGhlIGxlbmd0aCBvZiB0aGUgbW92aWUuIFNob3J0IG1vdmllcyAoc2hvcnRlciB0aGFuIDExMSBtaW51dGVzKSBoYXZlIGFuIGF2ZXJhZ2Ugc2NvcmUgb2YgNi4yIHdoaWxlIGxvbmdlciBvbmVzIGhhdmUgYW4gYXZlcmFnZSBzY29yZSBvZiA2LjkuIFZlcnkgbG9uZyBtb3ZpZXMgKGxlbmd0aCBsYXJnZXIgdGhhbiAxMzggbWludXRlcyBoYXZlIHRoZSBoaWdoZXN0IHNjb3JlOiA3LjQgb24gYXZlcmFnZSkuIAoKYGBge3J9Cm1vdmllcyA8LSBtb3ZpZXMgJT4lIAogICAgbXV0YXRlKGNvbG9yID0gYXMuZmFjdG9yKGNvbG9yKSwKICAgICAgICAgICBjb3VudHJ5ID0gYXMuZmFjdG9yKGNvdW50cnkpLAogICAgICAgICAgIHJhdGluZyA9IGFzLmZhY3RvcihyYXRpbmcpKQpmaXRfbW92aWVzIDwtIHJwYXJ0KGltZGJfc2NvcmUgfiAuICwgICAgICAgICAgICAgICMgU2hvcnQgZm9ybWF0IHRvIGluY2x1ZGUgYWxsIHZhcmlhYmxlcyAhISEKICAgICAgICAgICAgIGRhdGEgPSBtb3ZpZXMgJT4lIHNlbGVjdCgtdGl0bGUsIC1kaXJlY3RvciwgLWFjdG9yXzEsIC1hY3Rvcl8yLCAtYWN0b3JfMyks
IAogICAgICAgICAgICAgY3AgPSAwLjAwMSwgCiAgICAgICAgICAgICBtYXhkZXB0aCA9IDIpCnJwYXJ0LnBsb3QoZml0X21vdmllcykKYGBgCgpXaXRoIHNvbmdzLiAKCmBgYHtyLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRX0KZml0X3NvbmdzIDwtIHJwYXJ0KHBvcHVsYXJpdHkgfiAuLCAgICAjIFNob3J0IGZvcm11bGEhCiAgICAgICAgICAgICBkYXRhID0gc29uZ3MgJT4lIHNlbGVjdCgtc29uZ19uYW1lLCAtYXJ0aXN0LCAtYWxidW0pLCAgIyBSZW1vdmUgdW5uZWNlc3NhcnkgdmFyaWFibGVzCiAgICAgICAgICAgICBtZXRob2QgPSAiYW5vdmEiLCAjIE1ldGhvZDogcmVncmVzc2lvbiAiYW5vdmEiIHZzIGNsYXNzaWZpY2F0aW9uICJjbGFzcyIKICAgICAgICAgICAgIGNwID0gMC4wMDAxLCAKICAgICAgICAgICAgIG1heGRlcHRoID0gMykKcnBhcnQucGxvdChmaXRfc29uZ3MpCmBgYAoKTW9kZWxzIGNhbiBiZSBjb21wbGljYXRlZC4KCmBgYHtyLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRX0KZml0X3NvbmdzMiA8LSBycGFydChwb3B1bGFyaXR5IH4gLiwgICAgIyBTaG9ydCBmb3JtdWxhIQogICAgICAgICAgICAgZGF0YSA9IHNvbmdzICU+JSBzZWxlY3QoLXNvbmdfbmFtZSwgLWFydGlzdCwgLWFsYnVtKSwgICMgUmVtb3ZlIHVubmVjZXNzYXJ5IHZhcmlhYmxlcwogICAgICAgICAgICAgbWV0aG9kID0gImFub3ZhIiwgIyBNZXRob2Q6IHJlZ3Jlc3Npb24gImFub3ZhIiB2cyBjbGFzc2lmaWNhdGlvbiAiY2xhc3MiCiAgICAgICAgICAgICBjcCA9IDAuMDAwMSwgCiAgICAgICAgICAgICBtYXhkZXB0aCA9IDUpCnJwYXJ0LnBsb3QoZml0X3NvbmdzMikKYGBgCgpEZWNpc2lvbiB0cmVlcyBhcmUgZ3JlYXQgZm9yIHByZWRpY3Rpb25zOgoKYGBge3J9CnByZWRpY3QoZml0X3NvbmdzMiwgc29uZ3NbOTAwMCxdKSAjIFByZWRpY3Rpb24Kc29uZ3NbOTAwMCxdJHBvcHVsYXJpdHkgICAgICAgICAgIyBUcnVlIHZhbHVlCmBgYAoKUGVyZm9ybWFuY2UgaW5kaWNhdG9yczoKCmBgYHtyfQphbGxfcHJlZGljdGlvbnMgPC0gcHJlZGljdChmaXRfc29uZ3MsIHNvbmdzKQptZWFuKGFicyhhbGxfcHJlZGljdGlvbnMgLSBzb25ncyRwb3B1bGFyaXR5KSkKYGBgCgoKYGBge3J9CjEgLSBtZWFuKChhbGxfcHJlZGljdGlvbnMgLSBzb25ncyRwb3B1bGFyaXR5KV4yKSAvIG1lYW4oKHNvbmdzJHBvcHVsYXJpdHkgLSBtZWFuKHNvbmdzJHBvcHVsYXJpdHkpKV4yKQpgYGAKCgpHQVBNSU5ERVIKCmBgYHtyfQpkYXRhKCJnYXBtaW5kZXIiKQpmb3JjZShnYXBtaW5kZXIpCmZpdCA8LSBycGFydChsaWZlRXhwIH4gcG9wICsgZ2RwUGVyY2FwLAogICAgICAgICAgICAgZGF0YSA9IGdhcG1pbmRlciAlPiUgZmlsdGVyKHllYXIgPiAyMDA1KSwKICAgICAgICAgICAgIGNwID0gMC4wMDAxLAogICAgICAgICAgICAgbWF4ZGVwdGggPSAzKQpycGFydC5wbG90KGZpdCkKYGBgCgoKCiMgQ2xhc3NpZmljYXRpb24KCmBgYHtyfQpmaXRfY29s
b3IgPC0gcnBhcnQoY29sb3IgfiAuLCAgICAjIFNob3J0IGZvcm11bGEhCiAgICAgICAgICAgICBkYXRhID0gbW92aWVzICU+JSBzZWxlY3QoLXRpdGxlLCAtZGlyZWN0b3IsIC1hY3Rvcl8xLCAtYWN0b3JfMiwgLWFjdG9yXzMpLCAgIyBSZW1vdmUgdW5uZWNlc3NhcnkgdmFyaWFibGVzCiAgICAgICAgICAgICBtZXRob2QgPSAiY2xhc3MiLCAjIE1ldGhvZDogcmVncmVzc2lvbiAiYW5vdmEiIHZzIGNsYXNzaWZpY2F0aW9uICJjbGFzcyIKICAgICAgICAgICAgIGNwID0gMC4wMDAxLCAKICAgICAgICAgICAgIG1heGRlcHRoID0gMykKcnBhcnQucGxvdChmaXRfY29sb3IpCmBgYAoKCgojIE90aGVyIGRlbmRyb2dyYW1zCgpgYGB7cn0KbW92aWVzWzE6NjAsXSAlPiUgICAgIyBGaXJzdCA1MCBtb3ZpZXMKICAgIHNlbGVjdChidWRnZXQsIGVhcm5pbmdzLCBkdXJhdGlvbiwgaW1kYl9zY29yZSkgJT4lCiAgICBkaXN0KCkgJT4lICAgICAgICMgQ29tcHV0ZXMgZGlzdGFuY2UKICAgIGhjbHVzdCgpICU+JSAgICAgIyBDcmVhdGVzIGNsdXN0ZXJzCiAgICBwbG90KGNleCA9IDAuNykgICMgUGxvdHMKYGBgCgpHYXBtaW5kZXI/CgpgYGB7cn0KCmBgYAoK