Setting the data

First, install/activate the packages…

if(!require(FNN)){install.packages("FNN")}
library(tidyverse)
library(FNN)

… and load the data. (don’t forget to set your working directory).

load("songs.RData")

Next, we fix two important things: the target (around which to find neighbors) and the variables on which the distances will be computed. The dataset is songs, we choose the variables that have the same scale (from 0 to 1): danceability, energy, speechiness and valence.

target <- filter(songs, song_name == "Poker Face")
var_dist <- c("danceability", "energy", "speechiness", "valence")

Performing k-NN

Thanks to the FNN package, we are ready to go! The syntax is very simple

neighbors <- get.knnx(data = songs %>% select(all_of(var_dist)),    # Data source: be careful with the columns!
                      query = target %>% select(all_of(var_dist)),  # Target (with the right columns)
                      k = 20)                                       # Nb of neighbors

There are two outputs in neighbors: the index of the neighbors and the distance to the target.

neighbors$nn.index  # Index = row n° in the dataset
     [,1] [,2] [,3] [,4] [,5]  [,6]  [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
[1,] 1665 7797 8007  218  807 11416 10751 5843 5825  7984   183  7788 12332  1504  9343  9491    93
     [,18] [,19] [,20]
[1,]  8494  5509  7276
```r
neighbors$nn.dist  # The corresponding distances (in decreasing order naturally)

<!-- rnb-source-end -->

<!-- rnb-output-begin eyJkYXRhIjoiICAgICBbLDFdICAgICAgIFssMl0gICAgICAgWywzXSAgICAgICBbLDRdICAgICAgIFssNV0gICAgICBbLDZdICAgICAgIFssN10gICAgICAgWyw4XSAgICAgICBbLDldICAgICAgWywxMF1cblsxLF0gICAgMCAwLjAxMjk4NDk5IDAuMDE4ODQ4MDggMC4wMjg4NzkwNiAwLjAzMjgwNjI1IDAuMDQzMDcyNSAwLjA0Mzk2NDY0IDAuMDQ1OTg5MTMgMC4wNDYyOTU2OCAwLjA1MjYxMjE3XG4gICAgICAgICAgWywxMV0gICAgICBbLDEyXSAgICAgIFssMTNdICAgICAgWywxNF0gICAgICBbLDE1XSAgICAgIFssMTZdICAgICBbLDE3XSAgICAgIFssMThdICAgICAgWywxOV1cblsxLF0gMC4wNTM5Mjg1NiAwLjA1NjQ0Mjk4IDAuMDU4NTc2NzkgMC4wNTg3Mzk3NyAwLjA2MDEzMTg2IDAuMDYwNTcyMzUgMC4wNjIyMDMzIDAuMDYzNzAxMTggMC4wNjU0MDkxN1xuICAgICAgICAgIFssMjBdICAgICAgWywyMV0gICAgICBbLDIyXSAgICAgIFssMjNdICAgICAgWywyNF0gICAgICBbLDI1XSAgICAgIFssMjZdICAgICAgWywyN10gICAgICBbLDI4XVxuWzEsXSAwLjA2NjE0ODAyIDAuMDY2MjA2NDIgMC4wNjYzNDkxNSAwLjA2NzMwNTU3IDAuMDY5NDMwNjEgMC4wNjk2NzIzOCAwLjA2OTkyNjgyIDAuMDcwNDQ3MjEgMC4wNzEwNzcxNFxuICAgICAgICAgIFssMjldICAgICAgWywzMF0gICAgICBbLDMxXSAgICAgWywzMl0gICAgICBbLDMzXSAgICAgIFssMzRdICAgICAgWywzNV0gICAgICBbLDM2XSAgICAgIFssMzddXG5bMSxdIDAuMDcxMDkxNDkgMC4wNzE4Mzc5NCAwLjA3MjAzMTY2IDAuMDczNTUyNyAwLjA3MzU4MDE2IDAuMDczNjY5ODcgMC4wNzQ1NTc2MyAwLjA3NTI1ODQ5IDAuMDc1Mjg5MDRcbiAgICAgICAgICBbLDM4XSAgICAgWywzOV0gICAgICBbLDQwXSAgICAgIFssNDFdICAgICBbLDQyXSAgICAgWyw0M10gICAgICBbLDQ0XSAgICAgWyw0NV0gICAgICBbLDQ2XVxuWzEsXSAwLjA3NjU4NzQ3IDAuMDc2OTIyMyAwLjA3NzE5OTAzIDAuMDc3MjcwMjQgMC4wNzc2NDg1IDAuMDc3NzcwNSAwLjA3Nzg0MzE4IDAuMDc3OTIxNSAwLjA3OTE0Njk1XG4gICAgICAgICAgWyw0N10gICAgICBbLDQ4XSAgICAgIFssNDldICAgIFssNTBdXG5bMSxdIDAuMDc5MzYwOTUgMC4wNzk2MzI2NiAwLjA3OTcxMTczIDAuMDgwMDE5XG4ifQ== -->
 [,1]       [,2]       [,3]       [,4]       [,5]      [,6]       [,7]       [,8]       [,9]      [,10]

[1,] 0 0.01298499 0.01884808 0.02887906 0.03280625 0.0430725 0.04396464 0.04598913 0.04629568 0.05261217 [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [1,] 0.05392856 0.05644298 0.05857679 0.05873977 0.06013186 0.06057235 0.0622033 0.06370118 0.06540917 [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [1,] 0.06614802 0.06620642 0.06634915 0.06730557 0.06943061 0.06967238 0.06992682 0.07044721 0.07107714 [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [1,] 0.07109149 0.07183794 0.07203166 0.0735527 0.07358016 0.07366987 0.07455763 0.07525849 0.07528904 [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [1,] 0.07658747 0.0769223 0.07719903 0.07727024 0.0776485 0.0777705 0.07784318 0.0779215 0.07914695 [,47] [,48] [,49] [,50] [1,] 0.07936095 0.07963266 0.07971173 0.080019




<!-- rnb-output-end -->

<!-- rnb-chunk-end -->


<!-- rnb-text-begin -->


Let's see the corresponding songs.


<!-- rnb-text-end -->


<!-- rnb-chunk-begin -->


<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuc29uZ3NbYXMubnVtZXJpYyhuZWlnaGJvcnMkbm4uaW5kZXgpLF0gICAgICAgICAgICAgICMgQXBwbHlpbmcgdGhlIGluZGljZXMgdG8gdGhlIGRhdGFzZXRcbmBgYCJ9 -->

```r
songs[as.numeric(neighbors$nn.index),]              # Applying the indices to the dataset
neighbor_names <- songs[as.numeric(neighbors$nn.index),] %>% pull(song_name) # Same + keeping the names only

Visualizing the result

knn_data <- songs %>%                                   # Extracting data in tody format!
    select(var_dist, "song_name") %>%
    pivot_longer(names_to = "attribute", values_to = "value", -song_name)
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(var_dist)` instead of `var_dist` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
knn_data %>%                                            # Plotting
    ggplot(aes(x = attribute, y = value)) + geom_jitter(size = 0.5) +
    geom_jitter(data = knn_data %>% filter(song_name %in% neighbor_names), color = "yellow", size = 2) +
    geom_jitter(data = knn_data %>% filter(song_name == "Poker Face"), color = "red", size = 3) 

The job in incredibly well done!

Ok, but what if we want to include another variable, like tempo? Easy: add it to the list of variables and scale it via mutate()!

var_dist2 <- c("danceability", "energy", "speechiness", "valence", "tempo")  # Adding the tempo
songs2 <- songs %>% mutate(tempo = tempo / 250)                                          # Scaling the tempo
target2 <- target %>% mutate(tempo = tempo / 250)
head(songs2) %>% select(song_name, artist, duration, danceability, tempo)                # Check scale

Ok, we are ready for a second round of k-NN.

neighbors2 <- get.knnx(data = songs2 %>% select(var_dist2),                              # New data source!
                      query = target2 %>% select(var_dist2),                             # Target
                      k = 20,                                                            # Nb of neighbors
                      algorithm = "brute")                                               # Algo type             
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(var_dist2)` instead of `var_dist2` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
neighbor_names2 <- songs2[as.numeric(neighbors2$nn.index),] %>% pull(song_name)

knn_data2 <- songs2 %>% 
    select(var_dist2, "song_name") %>%             
    gather(key = attribute, value = value, -song_name)

knn_data2 %>% 
    ggplot(aes(x = attribute, y = value)) + geom_jitter(size = 0.5) +
    geom_jitter(data = knn_data2 %>% filter(song_name %in% neighbor_names2), color = "yellow", size = 2) +
    geom_jitter(data = knn_data2 %>% filter(song_name == "Poker Face"), color = "red", size = 3) 

Predictive k-NN

Nearest neighbors can be used for prediction purposes. We have used 5 variables to detect proximity. Let’s see if they can help predict the popularity of the song. Let’s compute the average popularity of Poker Face’s neighbors.

# The number we are trying to predict:
songs %>% filter(song_name == "Poker Face") %>% pull(popularity)
[1] 31
songs[as.numeric(neighbors2$nn.index),] %>% 
    pull(popularity) %>%
    mean()
[1] 44.7

Does weighting help improve the forecast?

library(magrittr)
songs[as.numeric(neighbors2$nn.index),] %>% 
    pull(popularity) %>%
    multiply_by(exp(-neighbors2$nn.dist)/mean(exp(-neighbors2$nn.dist))) %>%  # Pipe multiplication!
    mean()
[1] 44.69537

Not really.
What this means is that Poker Face is much less popular than songs that have very similar characteristics.

LS0tCnRpdGxlOiAiUzg6IGsgTmVhcmVzdCBOZWlnaGJvcnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgU2V0dGluZyB0aGUgZGF0YQoKRmlyc3QsIGluc3RhbGwvYWN0aXZhdGUgdGhlIHBhY2thZ2VzLi4uIAoKYGBge3IsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQppZighcmVxdWlyZShGTk4pKXtpbnN0YWxsLnBhY2thZ2VzKCJGTk4iKX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoRk5OKQpgYGAKCi4uLiBhbmQgbG9hZCB0aGUgZGF0YS4gKCpkb24ndCBmb3JnZXQgdG8gc2V0IHlvdXIgd29ya2luZyBkaXJlY3RvcnkqKS4KCmBgYHtyfQpsb2FkKCJzb25ncy5SRGF0YSIpCmBgYAoKTmV4dCwgd2UgZml4IHR3byBpbXBvcnRhbnQgdGhpbmdzOiB0aGUgKip0YXJnZXQqKiAoYXJvdW5kIHdoaWNoIHRvIGZpbmQgbmVpZ2hib3JzKSBhbmQgdGhlICoqdmFyaWFibGVzKiogb24gd2hpY2ggdGhlIGRpc3RhbmNlcyB3aWxsIGJlIGNvbXB1dGVkLiAKVGhlIGRhdGFzZXQgaXMgc29uZ3MsIHdlIGNob29zZSB0aGUgdmFyaWFibGVzIHRoYXQgaGF2ZSB0aGUgc2FtZSBzY2FsZSAoZnJvbSAwIHRvIDEpOiBkYW5jZWFiaWxpdHksIGVuZXJneSwgc3BlZWNoaW5lc3MgYW5kIHZhbGVuY2UuCgpgYGB7cn0KdGFyZ2V0IDwtIGZpbHRlcihzb25ncywgc29uZ19uYW1lID09ICJQb2tlciBGYWNlIikKdmFyX2Rpc3QgPC0gYygiZGFuY2VhYmlsaXR5IiwgImVuZXJneSIsICJzcGVlY2hpbmVzcyIsICJ2YWxlbmNlIikKYGBgCgojIFBlcmZvcm1pbmcgay1OTgoKVGhhbmtzIHRvIHRoZSBGTk4gcGFja2FnZSwgd2UgYXJlIHJlYWR5IHRvIGdvISBUaGUgc3ludGF4IGlzIHZlcnkgc2ltcGxlCgpgYGB7cn0KbmVpZ2hib3JzIDwtIGdldC5rbm54KGRhdGEgPSBzb25ncyAlPiUgc2VsZWN0KGFsbF9vZih2YXJfZGlzdCkpLCAgICAjIERhdGEgc291cmNlOiBiZSBjYXJlZnVsIHdpdGggdGhlIGNvbHVtbnMhCiAgICAgICAgICAgICAgICAgICAgICBxdWVyeSA9IHRhcmdldCAlPiUgc2VsZWN0KGFsbF9vZih2YXJfZGlzdCkpLCAgIyBUYXJnZXQgKHdpdGggdGhlIHJpZ2h0IGNvbHVtbnMpCiAgICAgICAgICAgICAgICAgICAgICBrID0gMjApICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBOYiBvZiBuZWlnaGJvcnMKYGBgCgpUaGVyZSBhcmUgdHdvIG91dHB1dHMgaW4gKipuZWlnaGJvcnMqKjogdGhlIGluZGV4IG9mIHRoZSBuZWlnaGJvcnMgYW5kIHRoZSBkaXN0YW5jZSB0byB0aGUgdGFyZ2V0LgoKYGBge3J9Cm5laWdoYm9ycyRubi5pbmRleCAgIyBJbmRleCA9IHJvdyBuwrAgaW4gdGhlIGRhdGFzZXQKYGBgCgpgYGB7cn0KbmVpZ2hib3JzJG5uLmRpc3QgICMgVGhlIGNvcnJlc3BvbmRpbmcgZGlzdGFuY2VzIChpbiBkZWNyZWFzaW5nIG9yZGVyIG5hdHVyYWxseSkKYGBgCgpMZXQncyBzZWUgdGhlIGNvcnJlc3BvbmRpbmcgc29uZ3MuCgpgYGB7cn0Kc29uZ3NbYXMubnVtZXJpYyhuZWlnaGJvcnMkbm4uaW5kZXgpLF0gICAgICAgICAgICAgICMgQXBwbHlpbmcgdGhlIGluZGljZXMgdG8gdGhlIGRhdGFzZXQKbmVpZ2hib3JfbmFtZXMgPC0gc29uZ3NbYXMubnVtZXJpYyhuZWlnaGJvcnMkbm4uaW5kZXgpLF0gJT4lIHB1bGwoc29uZ19uYW1lKSAjIFNhbWUgKyBrZWVwaW5nIHRoZSBuYW1lcyBvbmx5CmBgYAoKCiMgVmlzdWFsaXppbmcgdGhlIHJlc3VsdAoKYGBge3J9Cmtubl9kYXRhIDwtIHNvbmdzICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBFeHRyYWN0aW5nIGRhdGEgaW4gdG9keSBmb3JtYXQhCiAgICBzZWxlY3QodmFyX2Rpc3QsICJzb25nX25hbWUiKSAlPiUKICAgIHBpdm90X2xvbmdlcihuYW1lc190byA9ICJhdHRyaWJ1dGUiLCB2YWx1ZXNfdG8gPSAidmFsdWUiLCAtc29uZ19uYW1lKQprbm5fZGF0YSAlPiUgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgUGxvdHRpbmcKICAgIGdncGxvdChhZXMoeCA9IGF0dHJpYnV0ZSwgeSA9IHZhbHVlKSkgKyBnZW9tX2ppdHRlcihzaXplID0gMC41KSArCiAgICBnZW9tX2ppdHRlcihkYXRhID0ga25uX2RhdGEgJT4lIGZpbHRlcihzb25nX25hbWUgJWluJSBuZWlnaGJvcl9uYW1lcyksIGNvbG9yID0gInllbGxvdyIsIHNpemUgPSAyKSArCiAgICBnZW9tX2ppdHRlcihkYXRhID0ga25uX2RhdGEgJT4lIGZpbHRlcihzb25nX25hbWUgPT0gIlBva2VyIEZhY2UiKSwgY29sb3IgPSAicmVkIiwgc2l6ZSA9IDMpIApgYGAKClRoZSBqb2IgaW4gaW5jcmVkaWJseSB3ZWxsIGRvbmUhCgpPaywgYnV0IHdoYXQgaWYgd2Ugd2FudCB0byBpbmNsdWRlIGFub3RoZXIgdmFyaWFibGUsIGxpa2UgKip0ZW1wbyoqPyBFYXN5OiBhZGQgaXQgdG8gdGhlIGxpc3Qgb2YgdmFyaWFibGVzIGFuZCBzY2FsZSBpdCB2aWEgKiptdXRhdGUqKigpIQoKYGBge3J9CnZhcl9kaXN0MiA8LSBjKCJkYW5jZWFiaWxpdHkiLCAiZW5lcmd5IiwgInNwZWVjaGluZXNzIiwgInZhbGVuY2UiLCAidGVtcG8iKSAgIyBBZGRpbmcgdGhlIHRlbXBvCnNvbmdzMiA8LSBzb25ncyAlPiUgbXV0YXRlKHRlbXBvID0gdGVtcG8gLyAyNTApICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBTY2FsaW5nIHRoZSB0ZW1wbwp0YXJnZXQyIDwtIHRhcmdldCAlPiUgbXV0YXRlKHRlbXBvID0gdGVtcG8gLyAyNTApCmhlYWQoc29uZ3MyKSAlPiUgc2VsZWN0KHNvbmdfbmFtZSwgYXJ0aXN0LCBkdXJhdGlvbiwgZGFuY2VhYmlsaXR5LCB0ZW1wbykgICAgICAgICAgICAgICAgIyBDaGVjayBzY2FsZQpgYGAKCk9rLCB3ZSBhcmUgcmVhZHkgZm9yIGEgc2Vjb25kIHJvdW5kIG9mICprKi1OTi4KCmBgYHtyfQpuZWlnaGJvcnMyIDwtIGdldC5rbm54KGRhdGEgPSBzb25nczIgJT4lIHNlbGVjdCh2YXJfZGlzdDIpLCAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgTmV3IGRhdGEgc291cmNlIQogICAgICAgICAgICAgICAgICAgICAgcXVlcnkgPSB0YXJnZXQyICU+JSBzZWxlY3QodmFyX2Rpc3QyKSwgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgVGFyZ2V0CiAgICAgICAgICAgICAgICAgICAgICBrID0gMjAsICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBOYiBvZiBuZWlnaGJvcnMKICAgICAgICAgICAgICAgICAgICAgIGFsZ29yaXRobSA9ICJicnV0ZSIpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIEFsZ28gdHlwZSAgICAgICAgICAgICAKCm5laWdoYm9yX25hbWVzMiA8LSBzb25nczJbYXMubnVtZXJpYyhuZWlnaGJvcnMyJG5uLmluZGV4KSxdICU+JSBwdWxsKHNvbmdfbmFtZSkKCmtubl9kYXRhMiA8LSBzb25nczIgJT4lIAogICAgc2VsZWN0KHZhcl9kaXN0MiwgInNvbmdfbmFtZSIpICU+JSAgICAgICAgICAgICAKICAgIGdhdGhlcihrZXkgPSBhdHRyaWJ1dGUsIHZhbHVlID0gdmFsdWUsIC1zb25nX25hbWUpCgprbm5fZGF0YTIgJT4lIAogICAgZ2dwbG90KGFlcyh4ID0gYXR0cmlidXRlLCB5ID0gdmFsdWUpKSArIGdlb21faml0dGVyKHNpemUgPSAwLjUpICsKICAgIGdlb21faml0dGVyKGRhdGEgPSBrbm5fZGF0YTIgJT4lIGZpbHRlcihzb25nX25hbWUgJWluJSBuZWlnaGJvcl9uYW1lczIpLCBjb2xvciA9ICJ5ZWxsb3ciLCBzaXplID0gMikgKwogICAgZ2VvbV9qaXR0ZXIoZGF0YSA9IGtubl9kYXRhMiAlPiUgZmlsdGVyKHNvbmdfbmFtZSA9PSAiUG9rZXIgRmFjZSIpLCBjb2xvciA9ICJyZWQiLCBzaXplID0gMykgCmBgYAoKCiMgUHJlZGljdGl2ZSBrLU5OCgpOZWFyZXN0IG5laWdoYm9ycyBjYW4gYmUgdXNlZCBmb3IgcHJlZGljdGlvbiBwdXJwb3Nlcy4gV2UgaGF2ZSB1c2VkIDUgdmFyaWFibGVzIHRvIGRldGVjdCBwcm94aW1pdHkuIExldCdzIHNlZSBpZiB0aGV5IGNhbiBoZWxwIHByZWRpY3QgdGhlIHBvcHVsYXJpdHkgb2YgdGhlIHNvbmcuIExldCdzIGNvbXB1dGUgdGhlIGF2ZXJhZ2UgcG9wdWxhcml0eSBvZiAqUG9rZXIgRmFjZSoncyBuZWlnaGJvcnMuCgpgYGB7cn0KIyBUaGUgbnVtYmVyIHdlIGFyZSB0cnlpbmcgdG8gcHJlZGljdDoKc29uZ3MgJT4lIGZpbHRlcihzb25nX25hbWUgPT0gIlBva2VyIEZhY2UiKSAlPiUgcHVsbChwb3B1bGFyaXR5KQpzb25nc1thcy5udW1lcmljKG5laWdoYm9yczIkbm4uaW5kZXgpLF0gJT4lIAogICAgcHVsbChwb3B1bGFyaXR5KSAlPiUKICAgIG1lYW4oKQpgYGAKCkRvZXMgd2VpZ2h0aW5nIGhlbHAgaW1wcm92ZSB0aGUgZm9yZWNhc3Q/CgpgYGB7ciwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0V9CmxpYnJhcnkobWFncml0dHIpCnNvbmdzW2FzLm51bWVyaWMobmVpZ2hib3JzMiRubi5pbmRleCksXSAlPiUgCiAgICBwdWxsKHBvcHVsYXJpdHkpICU+JQogICAgbXVsdGlwbHlfYnkoZXhwKC1uZWlnaGJvcnMyJG5uLmRpc3QpL21lYW4oZXhwKC1uZWlnaGJvcnMyJG5uLmRpc3QpKSkgJT4lICAjIFBpcGUgbXVsdGlwbGljYXRpb24hCiAgICBtZWFuKCkKYGBgCgpOb3QgcmVhbGx5LiAgCldoYXQgdGhpcyBtZWFucyBpcyB0aGF0ICpQb2tlciBGYWNlKiBpcyBtdWNoIGxlc3MgcG9wdWxhciB0aGFuIHNvbmdzIHRoYXQgaGF2ZSB2ZXJ5IHNpbWlsYXIgY2hhcmFjdGVyaXN0aWNzLiAKCmBgYHtyfQoKYGBgCgo=