First, install/activate the packages…
# Install FNN only when it is not available yet. requireNamespace() tests for
# the package without attaching it; the library() calls below do the attaching.
if (!requireNamespace("FNN", quietly = TRUE)) {
  install.packages("FNN")
}
library(tidyverse)
library(FNN)
… and load the data (don’t forget to set your working directory).
# Restore the saved objects from disk (path relative to the working directory);
# the `songs` dataset used below is expected to come from this file.
load(file = "songs.RData")
Next, we fix two important things: the target (around which to find neighbors) and the variables on which the distances will be computed. The dataset is songs; we choose the variables that share the same scale (from 0 to 1): danceability, energy, speechiness and valence.
# Name of the song around which neighbors are searched. It lives in its own
# variable because later steps (plots, popularity lookup) refer to target_name.
target_name <- "In The End"
# Row(s) of the dataset matching the target song
target <- filter(songs, song_name == target_name)
# Columns on which the distances are computed
var_dist <- c("danceability", "energy", "speechiness", "valence")
Thanks to the FNN package, we are ready to go! The syntax is very simple:
# k-NN search: the reference data and the query must expose exactly the same
# columns, hence the identical all_of(var_dist) selection on both sides.
neighbors <- get.knnx(
  data = select(songs, all_of(var_dist)),   # Reference points: every song
  query = select(target, all_of(var_dist)), # Point(s) to find neighbors for
  k = 50                                    # Number of neighbors to return
)
There are two outputs in neighbors: the index of the neighbors and the distance to the target.
neighbors[["nn.index"]] # Row numbers (in the dataset) of the neighbors
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
[1,] 2 11021 7568 8644 12451 6735 7966 2198 8300 5331 5857 5233 2252 5897 5397 3566 7392 9990
[,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
[1,] 7009 5457 9786 6640 10545 7593 56 5952 6688 12669 11228 10 4769 12947 1052 1686 9084
[,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
[1,] 7865 12838 6650 7606 1072 5333 3632 5671 5860 3964 7822 9370 9953 11098 9362
neighbors$nn.dist # The corresponding distances (in increasing order: nearest neighbor first)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0.008312039 0.0163282 0.01940515 0.02977734 0.03017018 0.0304959 0.03237731 0.03330165 0.03451087
[,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
[1,] 0.03494582 0.03611149 0.03660546 0.03860622 0.04202428 0.04255585 0.04272657 0.04509157 0.0451792
[,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28]
[1,] 0.04560493 0.04614109 0.04777196 0.05024301 0.05125466 0.05132251 0.05133469 0.05149214 0.05176765
[,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
[1,] 0.05177152 0.05232246 0.05259553 0.054128 0.05453082 0.05459157 0.05581657 0.05683309 0.05685499
[,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
[1,] 0.05758481 0.05765319 0.05815677 0.05896575 0.05984279 0.06119028 0.06146064 0.06162475 0.06335803
[,47] [,48] [,49] [,50]
[1,] 0.06396007 0.06427161 0.06449031 0.06458181
Let’s see the corresponding songs.
# Rows of the dataset located at the neighbor indices (printed for inspection)
songs[as.numeric(neighbors$nn.index), ]
# Same rows, keeping only the song names
neighbor_names <- pull(songs[as.numeric(neighbors$nn.index), ], song_name)
# Reshape to tidy (long) format: one row per song and attribute
knn_data <- pivot_longer(
  select(songs, all_of(c(var_dist, "song_name"))),
  cols = -song_name,
  names_to = "attribute",
  values_to = "value"
)
# Three jittered layers: every song (small points), then the neighbors in
# yellow, then the target in red on top.
ggplot(knn_data, aes(x = attribute, y = value)) +
  geom_jitter(size = 0.5) +
  geom_jitter(
    data = filter(knn_data, song_name %in% neighbor_names),
    color = "yellow", size = 2
  ) +
  geom_jitter(
    data = filter(knn_data, song_name == target_name),
    color = "red", size = 3
  )
The job is incredibly well done!
Ok, but what if we want to include another variable, like tempo? Easy: add it to the list of variables and scale it via mutate()!
# Distance variables for the second round: the original four plus the tempo
var_dist2 <- c(
  "danceability", "energy", "speechiness", "valence",
  "tempo"
)
# Rescale the tempo (division by 250) identically in the dataset and in the
# target, so that both stay comparable
songs2 <- mutate(songs, tempo = tempo / 250)
target2 <- mutate(target, tempo = tempo / 250)
# Look at the first rows to check the new scale
songs2 %>%
  select(song_name, artist, duration, danceability, tempo) %>%
  head()
Ok, we are ready for a second round of k-NN.
# Second k-NN round on the extended variable set. all_of() is used because
# var_dist2 is an external character vector: passing it bare to select() is
# ambiguous (and deprecated), and all_of() matches the first round.
neighbors2 <- get.knnx(
  data = songs2 %>% select(all_of(var_dist2)),   # New data source!
  query = target2 %>% select(all_of(var_dist2)), # Target
  k = 10,                                        # Nb of neighbors
  algorithm = "brute"                            # Algo type
)
Warning messages:
1: Unknown or uninitialised column: `dist`.
2: Unknown or uninitialised column: `dist`.
# Names of the songs returned by the second k-NN round
neighbor_names2 <- songs2[as.numeric(neighbors2$nn.index), ] %>% pull(song_name)
# Long format (one row per song and attribute) for plotting. The columns come
# from an external character vector, hence all_of().
knn_data2 <- songs2 %>%
  select(all_of(c(var_dist2, "song_name"))) %>%
  pivot_longer(-song_name, names_to = "attribute", values_to = "value")
# Same three-layer plot as before, on the data that includes the scaled tempo:
# every song, then the neighbors in yellow, then the target in red.
ggplot(knn_data2, aes(x = attribute, y = value)) +
  geom_jitter(size = 0.5) +
  geom_jitter(
    data = filter(knn_data2, song_name %in% neighbor_names2),
    color = "yellow", size = 2
  ) +
  geom_jitter(
    data = filter(knn_data2, song_name == target_name),
    color = "red", size = 3
  )
Nearest neighbors can be used for prediction purposes. We have used 5 variables to detect proximity. Let’s see if they can help predict the popularity of the song. Let’s compute the average popularity of the target’s neighbors.
# Actual popularity of the target song, i.e. the value we try to predict:
pull(filter(songs, song_name == target_name), popularity)
[1] 66
# Unweighted average popularity of the neighbors found in the second round
mean(pull(songs[as.numeric(neighbors2$nn.index), ], popularity))
[1] 50
Does weighting help improve the forecast?
library(magrittr)
Warning message:
Unknown or uninitialised column: `dist`.
# Distance-weighted average popularity: each neighbor is weighted by
# exp(-distance), normalised so that the weights average to one.
# local() keeps the intermediate weight vector out of the workspace.
local({
  decay <- exp(-neighbors2$nn.dist)
  songs[as.numeric(neighbors2$nn.index), ] %>%
    pull(popularity) %>%
    multiply_by(decay / mean(decay)) %>% # Pipe multiplication!
    mean()
})
[1] 50.08815
Not really.
What this means is that the target (popularity 66) is noticeably more popular than songs that have very similar characteristics (about 50 on average).
# k-NN against the whole dataset: k is set to the number of rows, so that every
# song receives a distance to the target whatever the size of the dataset.
# all_of() is used because var_dist2 is an external character vector.
neighbors2 <- get.knnx(
  data = songs2 %>% select(all_of(var_dist2)),   # New data source!
  query = target2 %>% select(all_of(var_dist2)), # Target
  k = nrow(songs2),                              # Nb of neighbors: all songs
  algorithm = "brute"                            # Algo type
)
Warning messages:
1: Unknown or uninitialised column: `dist`.
2: Unknown or uninitialised column: `dist`.
# Names of the neighbors, in order of increasing distance
neighbor_names2 <- songs2[as.numeric(neighbors2$nn.index), ] %>% pull(song_name)
# Create the dist column before filling it: sub-assigning into a column that
# does not exist yet makes the tibble warn "Unknown or uninitialised column".
knn_data2 <- songs2 %>% mutate(dist = NA_real_)
# nn.index holds row numbers and nn.dist the matching distances, so this
# stores each song's distance to the target on that song's own row.
knn_data2$dist[as.numeric(neighbors2$nn.index)] <-
  as.numeric(neighbors2$nn.dist)
# Long format that keeps the song name and its distance on every row.
# The columns come from an external character vector, hence all_of().
knn_data2 <- knn_data2 %>%
  select(all_of(c(var_dist2, "song_name", "dist"))) %>%
  pivot_longer(
    cols = -c(song_name, dist),
    names_to = "attribute",
    values_to = "value"
  )
# The 1000 rows with the smallest distance to the target (rows are
# song/attribute pairs, since the data is in long format)
knn_data2_f <- head(arrange(knn_data2, dist), n = 1000)
# All songs as small points, the closest rows colored by their distance, and
# the target in red. The target is taken from target2 (the query the distances
# were computed from) instead of a hard-coded song name, so the highlighted
# point always matches the neighbors shown.
knn_data2 %>%
  ggplot(aes(x = attribute, y = value)) +
  geom_jitter(size = 0.5) +
  geom_jitter(
    data = knn_data2_f %>%
      filter(
        song_name %in% neighbor_names2,
        !(song_name %in% target2$song_name)
      ),
    aes(color = dist), size = 1
  ) +
  geom_jitter(
    data = knn_data2_f %>% filter(song_name %in% target2$song_name),
    color = "red", size = 3
  ) +
  scale_colour_distiller(
    palette = "Spectral", direction = +1, values = c(0, 0.8, 1)
  )