Spotify Stream History Analysis

Author

Javier Artiga (forked from Natalia Ciria)

Published

January 5, 2025

Set-up

Code
# Set-up parameters
# These three values are read by the later chunks: `threshold_year` is used in
# the `filter(year(date) >= threshold_year)` steps, `save_csv` gates the CSV
# export and `save_svg` gates every `ggsave()` call.
threshold_year<-2018 # Exclude older years from some analyses
save_csv <- TRUE # Export stream data frame as csv
save_svg <- TRUE # Export ggplots as svg files

# Required libraries

library(jsonlite) # Working with JSON data
library(dplyr) # Data transformation
library(tidyr) # Data cleaning
library(lubridate) # Handling dates and times
library(ggplot2) # Plot graphs
library(knitr) # Report formatting
if(save_svg) library(svglite) # Create SVG files (loaded only when plots are exported)

# Colour palette
# Seven hex colours; the plots below index into or pass this vector to
# scale_fill_manual() / scale_fill_gradientn().
pal<- c("#3abdaa","#7b2458","#facd00","#41658a","#e63946","#b2a3b5","#264653")

Data preparation

Code
# Find json streaming history files
stream_files <- list.files("input_files/", pattern = "Streaming_History_Audio")

# Fail early with a clear message instead of a cryptic fromJSON() error
if (length(stream_files) == 0) {
  stop(
    "No 'Streaming_History_Audio' files found in input_files/",
    call. = FALSE
  )
}

# Read and parse every streaming history file into its own data frame
stream_parts <- lapply(
  stream_files,
  function(file_name) {
    fromJSON(paste0("input_files/", file_name), flatten = TRUE)
  }
)

# Merge all streaming history files into one data frame.
# Reduce() applies the same merge(all = TRUE) as the former loop, but also
# works when there is a single file: `2:length(stream_files)` counted 2, 1 in
# that case and tried to read a file that does not exist.
stream <- Reduce(function(x, y) merge(x, y, all = TRUE), stream_parts)
Code
# Process Spotify streaming data

# Step 1: parse the timestamp and expose the long metadata columns under
# short names (the original columns are kept alongside the new ones).
stream <- mutate(
  stream,
  date = as_datetime(ts, tz = "UTC"),
  artist_name = master_metadata_album_artist_name,
  track_name = master_metadata_track_name,
  album_name = master_metadata_album_album_name
)

# Step 2: collapse raw platform / reason codes into readable categories.
stream <- mutate(
  stream,
  # Device type: the first matching pattern wins; a platform that matches
  # none of the patterns is left as NA.
  Device = case_when(
    grepl("Android-tablet|Android OS|android|iOS|ios", platform) ~ "Mobile",
    grepl("public_js|web_player|WebPlayer|chrome|Windows|windows|Linux|linux", platform) ~ "PC",
    grepl("cast|tv", platform) ~ "TV"
  ),
  # Why a track stopped playing; anything not listed becomes "Other"
  End = case_match(
    reason_end,
    c("trackdone", "endplay") ~ "Track finished",
    "logout" ~ "Spotify closed",
    "playbtn" ~ "Play button",
    "fwdbtn" ~ "Forward button",
    "backbtn" ~ "Backward button",
    .default = "Other"
  ),
  # Why a track started playing; anything not listed becomes "Other"
  Start = case_match(
    reason_start,
    c("clickrow", "click-row") ~ "Selected",
    "trackdone" ~ "Track finished",
    "persisted" ~ "Persisted",
    "playbtn" ~ "Play button",
    "fwdbtn" ~ "Forward button",
    "backbtn" ~ "Backward button",
    .default = "Other"
  )
)

# Export the processed data frame when requested in the set-up
if (save_csv) {
  write.csv(stream, "output_files/stream.csv")
}

Annual Spotify hours

Total

Code
# Yearly totals: hours played, number of streams, distinct track names and
# distinct artist names, from the threshold year onwards.
by_year <- stream %>%
  mutate(year = year(date)) %>%
  filter(year >= threshold_year) %>%
  group_by(year = as.factor(year)) %>%
  summarise(
    h_played = sum(ms_played)/1000/60/60, # ms -> hours
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Render the yearly summary with reader-friendly column headers
by_year %>%
  mutate(h_played = round(h_played, 2)) %>%
  select(
    Year = year,
    `Hours played` = h_played,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
Spotify listened by year
Year Hours played N tracks N songs N artists
2018 304.21 5741 3108 1023
2019 296.46 6081 2801 856
2020 595.35 10926 5431 2037
2021 470.17 9460 3818 1528
2022 411.07 8286 3583 1695
2023 377.25 10017 4804 1670
2024 799.84 17293 5226 2035

By device

Code
# Hours played per device category and year (threshold year onwards)
by_device <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(year = as.factor(year(date))) %>%
  group_by(Device, year) %>%
  summarise(h_played = sum(ms_played)/1000/60/60)

# Stacked bars: one bar per year, one segment per device.
# reverse = TRUE flips the stacking order; NA devices are drawn in grey.
plot_device <- ggplot(
  by_device,
  aes(x = year, y = h_played, fill = Device)
) +
  geom_col(position = position_stack(reverse = TRUE)) +
  scale_fill_manual(values = pal, na.value = "grey80") +
  labs(title = "Hours listening to Spotify", y = "") +
  theme_minimal()

# Display plot
plot_device

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave("plot_device.svg", plot = plot_device, path = "output_files/")
}

Shuffle mode

Code
# Number of streams per year and shuffle flag, plus each flag's share of the
# year's streams. The share is computed within year because summarise()
# drops only the last grouping level (shuffle).
by_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(year = as.factor(year(date))) %>%
  group_by(year, shuffle) %>%
  summarise(n = n()) %>%
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  )

# Stacked bars of the yearly shuffle share, y axis shown as percentages
plot_shuffle <- ggplot(
  by_shuffle,
  aes(x = year, y = `Proportion of tracks`, fill = Shuffle)
) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  labs(title = "Proportion of tracks listened in shuffle mode", y = "") +
  theme_minimal()

# Display plot
plot_shuffle

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave("plot_shuffle.svg", plot = plot_shuffle, path = "output_files/")
}

Track start and end

Code
# Create consistent palettes for track start and end visualization:
# a category that exists in both `Start` and `End` gets the same colour in
# both plots.

# Sort unique end and start values
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# Palette for end values: the first colours of `pal`, named by category.
# seq_along() (rather than 1:length()) stays correct for an empty vector.
pal_end <- setNames(pal[seq_along(names_end)], names_end)

# Colours of the categories shared by start and end
pal_start_end <- pal_end[names_end %in% names_start]

# Start-only categories take the colours not used by the shared categories.
# Only as many spare colours as there are start-only categories are taken,
# so the names always line up even when the data lacks some categories
# (assigning names of a different length would error or leave NA names).
names_start_only <- setdiff(names_start, names_end)
pal_spare <- pal[!pal %in% pal_start_end]
pal_start_only <- setNames(
  pal_spare[seq_along(names_start_only)],
  names_start_only
)

# Combine palettes
pal_start <- c(pal_start_end, pal_start_only)
Code
# Count streams per year, start reason and shuffle mode
by_reason_start_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    year = as.factor(year(date)),
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year, Start, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# 100% stacked bars of how tracks started, one panel per shuffle mode
plot_reason_start_shuffle <- ggplot(
  by_reason_start_shuffle,
  aes(x = year, y = n, fill = Start)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_start) +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  labs(title = "How tracks started", y = "Proportion of tracks") +
  theme_minimal() +
  theme(legend.title = element_blank())

# Display plot
plot_reason_start_shuffle

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_reason_start_shuffle.svg",
    plot = plot_reason_start_shuffle,
    path = "output_files/"
  )
}
Code
# Count streams per year, end reason and shuffle mode
by_reason_end_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    year = as.factor(year(date)),
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year, End, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# 100% stacked bars of how tracks ended, one panel per shuffle mode
plot_reason_end_shuffle <- ggplot(
  by_reason_end_shuffle,
  aes(x = year, y = n, fill = End)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_end) +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  labs(title = "How tracks ended", y = "Proportion of tracks") +
  theme_minimal() +
  theme(legend.title = element_blank())

# Display plot
plot_reason_end_shuffle

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_reason_end_shuffle.svg",
    plot = plot_reason_end_shuffle,
    path = "output_files/"
  )
}

Listening peaks

Hours per day calendar

Code
# Streams from the threshold year onwards
date_range <- stream %>%
  filter(year(date) >= threshold_year)

# One row per calendar day, from 1 January of the first year to 31 December
# of the last year in the range, numbered consecutively in `day_n`.
all_days <- data.frame(
  day_date = seq(
    as.Date(floor_date(min(date_range$date), unit = "year")),
    # ceiling_date() gives the start of the next year; minus one second
    # lands on 31 December of the last year with data
    as.Date(ceiling_date(max(date_range$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# For each year: offset of 1 January within its week (0 = Monday ... 6 =
# Sunday) and number of days in the year.
first_week_day <- all_days %>%
  mutate(
    weekday = as.POSIXlt(day_date)$wday,
    weekday = ifelse(weekday == 0, 7, weekday), # Sunday: 0 -> 7
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Daily metrics. The right join keeps every calendar day, so days without
# any stream appear as a single row whose stream columns are all NA.
by_day_year <- date_range %>%
  mutate(day_date = date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  summarise(
    h_played = sum(ms_played)/1000/60/60, # Convert ms to hours
    # Count real streams only: n() would report 1 for the all-NA placeholder
    # row of a day without streams
    n_tracks = sum(!is.na(ts)),
    # Share of streams played on a computer. The Device category is labelled
    # "PC"; comparing against "Computer" never matched and always gave 0.
    computer = mean(Device == "PC", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE)
  ) %>%
  # Add calendar columns
  mutate(
    h_played = ifelse(is.na(h_played), 0, h_played), # no streams -> 0 hours
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = as.POSIXlt(day_date)$wday,
    weekday = factor(ifelse(weekday == 0, 7, weekday), levels = c(1:7))
  ) %>%
  left_join(first_week_day, by = "year") %>%
  mutate(
    # Week row of the day within its year, Monday-based
    calendar_row = ceiling((first_week_day + day_year)/7)
  )

# Calendar heatmap: weekday on x, week row on y (negated so earlier weeks
# are on top), one column of panels per year and one row per month.
plot_day_year <- by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_played)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1,3,2)]), na.value = "grey95") +
  ggtitle("Hours listened per day") +
  theme(legend.title = element_blank()) +
  ylab("")

# Display plot
plot_day_year

Code
# Save plot if save_svg is TRUE
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_day_year.svg",
    plot = plot_day_year,
    width = 7,
    height = 8,
    path = "output_files/"
  )
}

Top days

Code
# Streams from the threshold year onwards
date_range <- stream %>%
  filter(year(date) >= threshold_year)

# Number of times each track name was played on each day, most played first
by_top_track_day <- date_range %>%
  mutate(day_date = date(date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n()) %>%
  ungroup() %>%
  arrange(desc(n_tracks))

# The 10 days with most hours played (slice_max() keeps ties and returns the
# rows sorted by h_played, so no separate arrange() is needed)
by_top_day <- date_range %>%
  mutate(day_date = date(date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    # The Device category is labelled "PC"; comparing against "Computer"
    # never matched and always gave 0.
    computer = mean(Device == "PC"),
    shuffle = mean(shuffle),
    h_played = sum(ms_played)/1000/60/60
  ) %>%
  ungroup() %>%
  slice_max(h_played, n = 10)

# For each top day, compute the share of distinct track names that were
# played more than once, then add the daily totals back.
by_top_day <- by_top_day %>%
  left_join(by_top_track_day, by = "day_date") %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  left_join(by_top_day, by = "day_date") %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = paste0(round(mult_times, 4)*100, "%")
  ) %>%
  kable(caption = "Top 10 days by listening hours")
Top 10 days by listening hours
Date Hours N tracks % tracks replayed
2024-01-28 16.16 272 58.82%
2024-01-29 10.34 187 37.11%
2021-11-23 10.27 205 33.85%
2024-01-13 10.11 184 21.53%
2023-12-16 9.34 243 15.12%
2024-01-12 8.64 156 6.21%
2024-01-15 8.32 172 13.48%
2024-03-25 7.95 155 16.79%
2019-04-12 7.62 127 37.65%
2020-05-07 7.56 149 19.2%

Minutes listened by hour of the day

Code
# Create a dataframe with all possible hours for each day.
# lubridate::hour() returns 0-23, so the grid must use 0:23. With 1:24 the
# streams of the midnight hour (0) were dropped by the right join and hour
# 24 was always empty.
all_hours <- all_days %>% expand(day_date, hour = 0:23)

date_range <- stream %>%
  filter(year(date) >= threshold_year) # Filter data from threshold year onwards

# Minutes played per day and hour (0 when nothing was played), then averaged
# over the days of each year. Hours are taken from `date`, which was parsed
# as UTC.
by_hour_year <- date_range %>%
  mutate(
    day_date = date(date),
    hour = hour(date)
  ) %>%
  right_join(all_hours, by = c("day_date", "hour")) %>%
  group_by(day_date, hour) %>%
  mutate(ms_played = ifelse(is.na(ms_played), 0, ms_played)) %>%
  summarise(min_played = sum(ms_played/1000/60)) %>%
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played))


# Heatmap of average minutes listened: one column per year, one row per hour
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1,3,2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(legend.title = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Display plot
plot_hour_year

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_hour_year.svg",
    plot = plot_hour_year,
    width = 7,
    height = 5,
    path = "output_files/"
  )
}

What have I listened to in Spotify?

Top artists

Code
# Yearly totals used as denominators for the artist shares.
# Note: this replaces the earlier `by_year` (here `year` is numeric and the
# played time is in minutes).
by_year <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played)/1000/60,
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Statistics by artist and year
by_artist_year <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played/1000/60),
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    # Ungroup so artist_name can be split below without a grouping column
    # being involved
    .groups = "drop"
  ) %>%
  # Split comma-separated artist names into up to five columns ...
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  # ... and stack them so every credited artist gets its own row
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  left_join(by_year, by = "year") %>%
  filter(!is.na(artist_name)) %>% # drop the empty artist slots
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # The yearly totals are constant within an (artist, year) group; first()
    # collapses them to one value. Dividing by the whole column returned one
    # row per source row, i.e. duplicated artists that appear both alone and
    # in a comma-separated credit.
    p_tracks = n_tracks/first(n_tracks_year),
    p_min_played = min_played/first(min_played_year),
    p_songs = n_songs/first(n_songs_year),
    .groups = "drop_last" # stays grouped by artist_name, the default
  ) %>%
  arrange(year, desc(n_tracks))

# Display the three most played artists of each year
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = paste0(round(p_min_played, 4)*100, "%"),
    `% of the annual tracks` = paste0(round(p_tracks, 4)*100, "%"),
    `% of the annual songs` = paste0(round(p_songs, 4)*100, "%")
  ) %>%
  kable(caption = "Top 3 artists by year")
Top 3 artists by year
Year Artist Minutes played N tracks N songs % of the annual time % of the annual tracks % of the annual songs
2018 Ayax y Prok 1239.58 453 44 6.79% 7.89% 1.42%
2018 Arctic Monkeys 590.24 187 60 3.23% 3.26% 1.93%
2018 Bejo 378.36 169 24 2.07% 2.94% 0.77%
2019 George Ezra 871.16 326 31 4.9% 5.36% 1.11%
2019 Ayax y Prok 623.96 237 43 3.51% 3.9% 1.54%
2019 Marea 782.26 216 42 4.4% 3.55% 1.5%
2020 Mac Quayle 1211.13 421 163 3.39% 3.85% 3%
2020 Bejo 917.55 374 39 2.57% 3.42% 0.72%
2020 Mac Miller 821.91 210 60 2.3% 1.92% 1.1%
2021 Ayax y Prok 1108.59 398 41 3.93% 4.21% 1.07%
2021 C. Tangana 887.91 383 39 3.15% 4.05% 1.02%
2021 Kase.O 695.49 253 33 2.47% 2.67% 0.86%
2022 Bad Bunny 841.25 281 30 3.41% 3.39% 0.84%
2022 Estopa 794.53 270 65 3.22% 3.26% 1.81%
2022 Ayax y Prok 581.29 238 45 2.36% 2.87% 1.26%
2023 Los gandules 574.14 431 124 2.54% 4.3% 2.58%
2023 Marea 997.79 336 49 4.41% 3.35% 1.02%
2023 Extremoduro 433.73 184 78 1.92% 1.84% 1.62%
2024 La Maravillosa Orquesta del Alcohol 1073.94 443 33 2.24% 2.56% 0.63%
2024 Estopa 1058.58 404 43 2.21% 2.34% 0.82%
2024 Arde Bogotá 825.37 283 26 1.72% 1.64% 0.5%
Code
# Artists behind the 40 highest (artist, year) play counts. The slice runs
# on the ungrouped table, so it is the top 40 artist-year rows overall, not
# 40 per year; the totals are summed over those selected rows only.
top_artist_year <- by_artist_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Statistics by artist over the whole period
by_artist_all <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  # Total number of streams, recorded before rows are multiplied by the
  # artist split below; used as the denominator of p_tracks_all
  mutate(n_streams = n()) %>%
  # Split multiple artists into separate rows
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played/1000/60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name),
    # Share of all streams credited to the artist
    # (the former n()/sum(n()) was always 1)
    p_tracks_all = n()/first(n_streams)
  ) %>%
  filter(!is.na(artist_name)) %>% # drop the empty artist slots
  arrange(desc(n_tracks_all))

# Get top 40 artists overall
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)


# Display table (slice_head() does not pad with NA rows when there are
# fewer than 10 artists)
top_artist_all %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ) %>%
  kable(caption = "Top 10 artists (all years)")
Top 10 artists (all years)
Artist Minutes played N tracks N songs
Ayax y Prok 4355.56 1661 132
C. Tangana 2735.39 1145 71
Bejo 2553.25 1131 68
La Maravillosa Orquesta del Alcohol 2620.12 1018 118
Arctic Monkeys 2693.23 965 109
Estopa 2510.11 929 99
Los gandules 1329.98 879 155
Marea 2812.18 847 95
Extremoduro 2772.74 835 123
Kase.O 2306.25 807 79
Code
# Names of the artists that are in either top list (by year or overall)
top_artist_global <- base::union(
  top_artist_year$artist_name,
  top_artist_all$artist_name
)


# Yearly statistics of those artists, enriched with their all-time totals
# and sorted by yearly play count
by_top_artist <- by_artist_year %>%
  filter(is.element(artist_name, top_artist_global)) %>%
  left_join(by_artist_all) %>%
  arrange(desc(n_tracks))

Global

Code
# Heatmap of yearly play counts for the top artists: one column of panels
# per year and one row per artist, artists ordered by all-time play count.
plot_top_artist_year <- ggplot(
  by_top_artist,
  aes(
    x = year,
    y = reorder(artist_name, -n_tracks_all),
    fill = n_tracks
  )
) +
  geom_tile() +
  facet_grid(
    rows = vars(reorder(artist_name, -n_tracks_all)),
    cols = vars(year),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1,3,2)])) +
  labs(title = "Times listened to most played artists") +
  theme_void() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_top_artist_year.svg",
    plot = plot_top_artist_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}

Global + Top 10 by year

Code
# Names of the artists that were among the 10 most played of any year
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  distinct(artist_name)

# Final artist list: the 40 most played artists overall, plus the artists in
# `top_artist_year`, plus the yearly top-10 artists (names only, no repeats)
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  bind_rows(top_artist_year, top_artist_year_partial) %>%
  distinct(artist_name)
Code
# Prepare data for matrix visualization:
# keep the yearly share of tracks (p_tracks) of every artist listed in
# top_artist_all, sorted by year and then by decreasing share. The join has
# no explicit `by`, so it uses the columns the two tables have in common.
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all) %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Create wide format matrix (one row per artist, one column per year).
# It is only used to derive the row and column order below
# (this is redundant, but I was stuck getting the artist order right)
matrix <- tidyr::pivot_wider(
  data = by_top_artist_matrix,
  id_cols = artist_name,
  names_from = year,
  values_from = p_tracks
)

# Set ordering for visualization: artists in the row order of the wide
# table, years in its column order (first column is artist_name)
row_order <- matrix$artist_name
col_order <- names(matrix)[-1]

# Create heatmap visualization: one column of panels per year and one row
# per artist, filled by the artist's share of that year's tracks
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    n_row = row_number(),
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(artist_name),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1,3,2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_top_artist_matrix.svg",
    plot = plot_top_artist_matrix,
    width = 8,
    height = 15,
    path = "output_files/"
  )
}

Top tracks

Most listened

Code
# Calculate total play time and play count for each track (name + artist),
# most played first
top_track <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played/1000/60), # Convert ms to minutes
    n_tracks = n()
  ) %>%
  arrange(desc(n_tracks)) %>%
  ungroup()

# Display table.
# Rows without a track name are removed first, then the 10 most played are
# kept and relabelled. The former `%>% na.omit(transmute(...))` passed the
# transmute() call as an unused extra argument of na.omit(), so the columns
# were never renamed or rounded and the table had fewer than 10 rows.
top_track %>%
  filter(!is.na(track_name)) %>%
  slice_head(n = 10) %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 tracks")
Top 10 tracks
track_name artist_name min_played n_tracks
Mentecato Bejo 311.1535 150
Tú Me Dejaste De Querer C. Tangana 343.6406 132
Crîtto de lâ Nabahâ Califato ¾ 369.8610 120
Hablar, Hablar, Hablar… Los Zigarros 297.4445 110
Fresas con nata Ayax y Prok 334.4053 101
Watch Out The Big Push 298.5197 99
No Gires Varry Brava 195.7154 96
Como Camarón Estopa 284.7042 92
Neverita Bad Bunny 203.5799 92
Code
# Calculate statistics by track name and year
# (tracks are identified by name only, so same-named tracks are pooled)
by_track_year <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played/1000/60),
    n_tracks = n()
  )

# Tracks behind the 30 highest (track, year) play counts. The slice runs on
# the ungrouped table, so it is the top 30 track-year rows overall; the
# totals are summed over those selected rows only.
top_track_year <- by_track_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  )

# All-time totals for every track
track_totals <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  )

# The 30 most played tracks overall
top_track_overall <- track_totals %>%
  slice_max(n_tracks_all, n = 30)

# Final track list: top 30 overall plus the tracks in top_track_year, one
# row per named track with its all-time totals. The former
# `bind_rows(...) %>% na.omit(distinct())` never ran distinct() (it was
# passed as an unused argument of na.omit()), so tracks present in both
# lists were duplicated with two different totals.
top_track_all <- track_totals %>%
  filter(
    !is.na(track_name),
    track_name %in% c(top_track_overall$track_name, top_track_year$track_name)
  )


# Heatmap of yearly play counts for those tracks: one column of panels per
# year and one row per track, tracks ordered by all-time play count
plot_track_year <- by_track_year %>%
  right_join(top_track_all, by = "track_name") %>%
  arrange(n_tracks) %>%
  ggplot(aes(x = year, y = reorder(track_name, -n_tracks_all), fill = n_tracks)) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(track_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1,3,2)])) +
  ggtitle("Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_track_year.svg",
    plot = plot_track_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}

Manually selected tracks

Code
# How often each named track was started by direct selection
# (Start == "Selected"), from the threshold year onwards
by_track_select <- stream %>%
  filter(
    year(date) >= threshold_year,
    Start == "Selected",
    !is.na(track_name)
  ) %>%
  group_by(track_name, artist_name) %>%
  summarise(n_selected = n()) %>%
  ungroup() %>%
  # Bring in each track's total play count and minutes
  left_join(top_track) %>%
  # Share of the track's plays that were direct selections
  mutate(p_selected = n_selected/n_tracks) %>%
  # Keep the 20 most selected tracks (ties are kept), most selected first
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
by_track_select %>%
  transmute(
    Track = track_name,
    Artist = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = paste0(round(p_selected,4)*100,"%"),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top selected tracks")
Top selected tracks
Track Artist Times Selected % Selected N tracks
Mentecato Bejo 24 16% 150
Nuestra fosa Marea 16 25.81% 62
Demasiadas Mujeres C. Tangana 15 16.48% 91
Es miércoles Spotify 14 56% 25
Moscow Mule Bad Bunny 14 25.93% 54
Rizando el rizo Ayax y Prok 13 22.41% 58
Tú Me Dejaste De Querer C. Tangana 13 9.85% 132
Cada vez cadáver Fito y Fitipaldis 12 18.18% 66
Café solo Ayax y Prok 12 17.91% 67
Es lunes Spotify 12 41.38% 29
Fresas con nata Ayax y Prok 12 11.88% 101
Think About Things Daði Freyr 12 30% 40
When You Were Young The Killers 12 18.46% 65
Buena muerte Marea 11 25% 44
Dentro De La Ley Los Zigarros 11 22% 50
Dispárame Los Zigarros 11 28.21% 39
Es viernes Spotify 11 42.31% 26
Mañana Es Lunes Rubén Pozo 11 28.95% 38
Renacimiento Kase.O 11 12.79% 86
Teddy Picker Arctic Monkeys 11 15.94% 69
What’s the Use? Mac Miller 11 25.58% 43
Workingman’s Blues #2 Bob Dylan 11 55% 20

In loop

Times played in one day

Code
# Display table.
# Rows without a track name are removed first, then the 10 highest daily
# play counts are kept and relabelled. The former
# `%>% na.omit(transmute(...))` passed the transmute() call as an unused
# extra argument of na.omit(), so the columns were never renamed and the
# table had fewer than 10 rows.
by_top_track_day %>%
  filter(!is.na(track_name)) %>%
  slice_head(n = 10) %>%
  transmute(
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 songs by times played in one day")
Top 10 songs by times played in one day
track_name day_date n_tracks
La Danza 2024-08-09 45
Como Camarón 2024-01-28 16
Watch Out 2024-01-28 16
Crystal Fighters 2024-01-28 15
Crîtto de lâ Nabahâ 2024-01-28 15
Decidí - Versión 2004 2024-01-28 15
Héroes del Sábado 2024-01-28 15
Code
# Tracks "in loop": for every named track, count the days on which it was
# played more than 5 times, and keep the tracks with more than 2 such days
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5, !is.na(track_name)) %>%
  count(track_name, name = "n_days") %>%
  filter(n_days > 2) %>%
  arrange(desc(n_days))

# Display table
by_top_track_loop %>%
  select(
    Track = track_name,
    `Days in loop (times played>5)` = n_days
  ) %>%
  kable(caption = "Tracks played more than 5 times several days")
Tracks played more than 5 times several days
Track Days in loop (times played>5)
Como Camarón 3
Ojalá 3

Playlists and Library tracks

Set up

Code
# Find json playlist files
playlist_files <- list.files("input_files/", pattern = "Playlist")

# Read and parse playlist data from the first matching JSON file; its first
# top-level element is taken as the table of playlists (one row each).
# Note: only the first file is read, any further matches are ignored.
playlist <- fromJSON(paste0("input_files/", playlist_files[1]), flatten = TRUE)[[1]]

# Extract the tracks of every playlist, tagged with the playlist name.
# seq_len() keeps this correct when there are no playlists (1:nrow() would
# count 1, 0), empty playlists are skipped instead of producing a spurious
# row, and the pieces are bound once instead of growing a data frame.
playlist_tracks <- bind_rows(
  lapply(seq_len(nrow(playlist)), function(i) {
    items_i <- playlist$items[[i]]
    if (!is.data.frame(items_i) || nrow(items_i) == 0) {
      return(NULL)
    }
    items_i$playlist_name <- playlist$name[i]
    items_i
  })
)

# Read library data from JSON file
library <- fromJSON("input_files/YourLibrary.json", flatten = TRUE)

# Saved tracks, with the same column names as the streaming data
library_tracks <- library$tracks %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )

# Saved albums. `exact = FALSE` reproduces the partial name matching that
# `library$album` performed implicitly.
# NOTE(review): the export key is presumably "albums" - confirm and use it.
library_albums <- library[["album", exact = FALSE]] %>%
  mutate(
    artist_name = artist,
    album_name = album
  )

# Replace the parsed list by the merged tracks + albums table
library <- merge(
  library$tracks,
  library[["album", exact = FALSE]],
  all = TRUE
)

User playlists

Code
# Parse the date a track was added and expose the track metadata under the
# same column names used for the streaming data
playlist_tracks <- mutate(
  playlist_tracks,
  added_date = as_datetime(addedDate, tz = "UTC"),
  artist_name = track.artistName,
  track_name = track.trackName,
  album_name = track.albumName
)

# Number of playlist entries per track (name + artist), most frequent first
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n()) %>%
  ungroup() %>%
  arrange(desc(n_playlists))

# Display the first 10 rows
playlist_top_tracks[seq(1, 10), ] %>%
  select(
    Track = track_name,
    Artist = artist_name,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Tracks included in more playlists")
Tracks included in more playlists
Track Artist N playlists
Mentecato BEJO 15
Hablar, Hablar, Hablar… Los Zigarros 14
Fluorescent Adolescent Arctic Monkeys 11
Querría El Kanka 11
Renacimiento Kase.O 11
1932 La Maravillosa Orquesta del Alcohol 10
Brianstorm Arctic Monkeys 10
Como Camarón Estopa 10
Héroes del Sábado La Maravillosa Orquesta del Alcohol 10
Madrid Karavana 10
Code
# Per artist: playlist entries, distinct track names and distinct playlists,
# sorted by the number of playlists the artist appears in
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  ungroup() %>%
  arrange(desc(n_playlists))

# Display the first 10 rows
playlist_top_artist[seq(1, 10), ] %>%
  select(
    Artist = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Artists included in more playlists")
Artists included in more playlists
Artist N tracks N songs N playlists
Los Zigarros 117 39 21
Arctic Monkeys 179 68 19
Kase.O 130 44 19
BEJO 182 44 18
La Maravillosa Orquesta del Alcohol 210 96 18
El Kanka 74 23 17
Fito y Fitipaldis 180 64 17
Extremoduro 237 124 16
Leiva 123 54 16
Ayax y Prok 175 72 15

User library

Code
# Artists ranked by the number of distinct saved songs in the library
library_tracks_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  ungroup() %>%
  arrange(desc(n_songs))

# Display the first 10 rows
library_tracks_top_artist[seq(1, 10), ] %>%
  select(
    Artist = artist_name,
    `N songs` = n_songs
  ) %>%
  kable(caption = "Artists with more songs in library")
Artists with more songs in library
Artist N songs
Amor Líquido 1
Doja Cat 1
El Niño de la Hipoteca 1
Fools Garden 1
Gabry Ponte 1
Gala 1
Hinds 1
Jain 1
Juan Antonio Canta 1
Lucio Battisti 1
Code
# Artists ranked by the number of distinct saved albums in the library
library_albums_top_artist <- library_albums %>%
  group_by(artist_name) %>%
  summarise(n_albums = n_distinct(album_name)) %>%
  ungroup() %>%
  arrange(desc(n_albums))

# Display the first 10 rows
library_albums_top_artist[seq(1, 10), ] %>%
  select(
    Artist = artist_name,
    `N albums` = n_albums
  ) %>%
  kable(caption = "Artists with more albums in library")
Artists with more albums in library
Artist N albums
La Maravillosa Orquesta del Alcohol 8
Extremoduro 7
Arctic Monkeys 6
Marea 6
Vetusta Morla 6
Los Zigarros 5
Supersubmarina 5
Various Artists 5
Biznaga 4
Kase.O 4

Saved in playlist or library

Code
# Track names saved in any playlist or in the library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Artist names found in playlists, saved songs or saved albums
saved_artist <- unique(
  c(
    playlist_top_artist$artist_name,
    library_tracks_top_artist$artist_name,
    library_albums_top_artist$artist_name
  )
)

# Per year, share of streams whose track name is among the saved tracks
# (matching is by track name only)
plot_saved_track <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    year = as.factor(year(date)),
    `In my playlists` = if_else(track_name %in% saved_tracks, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  labs(
    title = "Were the tracks (streaming history) in the current playlists?",
    y = ""
  ) +
  theme_minimal()

# Display plot
plot_saved_track

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_saved_track.svg",
    plot = plot_saved_track,
    path = "output_files/"
  )
}
Code
# Per year, share of streams whose artist name is among the saved artists
plot_saved_artist <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    year = as.factor(year(date)),
    `In my playlists` = if_else(artist_name %in% saved_artist, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  labs(
    title = "Were the artists (streaming history) in the current playlists?",
    y = ""
  ) +
  theme_minimal()


# Display plot
plot_saved_artist

Code
# Save plot
# The plot object is passed explicitly so the export does not depend on
# whichever plot ggplot2 currently remembers as last_plot().
if (save_svg) {
  ggsave(
    "plot_saved_artist.svg",
    plot = plot_saved_artist,
    path = "output_files/"
  )
}